From 08213f9093a906b4d1c118fac958b52f90c1ff9a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Apr 2026 02:21:07 -0400 Subject: [PATCH 01/31] salience: add gRPC client + TLS plumbing for stateful vllm sessions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the client-side of a stateful gRPC protocol against vllm, plus the TLS trust machinery so we can talk to self-signed vllm servers. Protocol (proto/salience.proto): Bidi-streaming Session RPC carries OpenSession / AppendTokens / Generate / Cancel from client and SessionReady / PrefillProgress / Token / GenerateDone / Error from server. Separate Fork unary RPC for cheap branching (prefix cache shares KV automatically). Plus ListSessions, CloseSession, GetReadoutManifest admin RPCs. Per-token readouts ship as packed f32 ([n_layers * n_concepts] per token, flat). Logprobs use range-selected positions plus a top-k parameter — empty ranges means no logprobs, any range means emit sampled-token logprob at those positions, top_k > 0 adds alternatives. Client (src/agent/api/salience.rs): Tonic-generated types under pb::, a connect() helper, with_auth() for bearer metadata, and a Session handle wrapping the bidi stream: open() handshakes SessionReady; append() is fire-and-forget; generate() returns impl Stream that drains inbound until Done or terminating Error. One generate at a time per session. Peak picker (src/agent/salience.rs): Pure function over ReadoutEntry traces. Per-concept z-score against trace global stats; contiguous above-threshold regions emit one peak at the local max. Configurable sigma threshold and min-std safety floor. Deterministic tie-break on offset then concept name. 12 unit tests covering empty traces, flat channels, single/multi spikes, contiguous humps, multi-concept independence, trailing runs, sub-threshold noise, layer-out-of-range, manifest shape mismatch, and threshold tunability. 
TLS (src/agent/api/http.rs): HttpClient::build now also loads every .pem file under ~/.consciousness/certs/ into the rustls root store — so dropping a .pem in that directory is enough to trust a new self- signed server; no code changes per new host. Also installs the rustls default crypto provider explicitly via OnceLock: tonic's tls features pulled in both ring and aws-lc-rs on the resolver path, and rustls 0.23 refuses to auto-pick when either could win. Build (build.rs, Cargo.toml): tonic-build generates Rust types from proto/salience.proto at cargo-build time, using a vendored protoc binary (protoc-bin-vendored) so no system install is required. New runtime deps: tonic, prost, async-stream, tokio-stream, rustls-pemfile. Co-Authored-By: Proof of Concept --- Cargo.lock | 514 ++++++++++++++++++++++++++++++++++- Cargo.toml | 8 + build.rs | 17 ++ proto/salience.proto | 260 ++++++++++++++++++ src/agent/api/http.rs | 69 ++++- src/agent/api/mod.rs | 449 ++++-------------------------- src/agent/api/salience.rs | 249 +++++++++++++++++ src/agent/context.rs | 59 ++-- src/agent/mod.rs | 19 +- src/agent/salience.rs | 309 +++++++++++++++++++++ src/agent/tools/vision.rs | 13 +- src/lib.rs | 3 + src/logging.rs | 146 ++++++++++ src/subconscious/generate.rs | 13 +- src/user/mod.rs | 5 + 15 files changed, 1691 insertions(+), 442 deletions(-) create mode 100644 proto/salience.proto create mode 100644 src/agent/api/salience.rs create mode 100644 src/agent/salience.rs create mode 100644 src/logging.rs diff --git a/Cargo.lock b/Cargo.lock index 394168a..f88965a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -165,6 +165,39 @@ dependencies = [ "tree-sitter-yaml", ] +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" 
+version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "atomic" version = "0.6.1" @@ -208,6 +241,53 @@ dependencies = [ "fs_extra", ] +[[package]] +name = "axum" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +dependencies = [ + "async-trait", + "axum-core", + "bytes", + "futures-util", + "http", + "http-body", + "http-body-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "sync_wrapper", + "tower 0.5.3", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper", + "tower-layer", + "tower-service", +] + [[package]] name = "base64" version = "0.13.1" @@ -491,6 +571,7 @@ dependencies = [ "anyhow", "ast-grep-core", "ast-grep-language", + "async-stream", "base64 0.22.1", "bytes", "capnp", @@ -518,11 +599,14 @@ dependencies = [ "notify-debouncer-mini", "paste", "peg", + "prost", + "protoc-bin-vendored", "ratatui", "redb", "regex", "rustls", "rustls-native-certs", + "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", @@ -531,7 +615,10 @@ dependencies = [ "tokenizers", 
"tokio", "tokio-rustls", + "tokio-stream", "tokio-util", + "tonic", + "tonic-build", "tui-markdown", "tui-textarea-2", "uuid", @@ -1064,6 +1151,12 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + [[package]] name = "flate2" version = "1.1.9" @@ -1288,6 +1381,31 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "h2" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http", + "indexmap 2.14.0", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + [[package]] name = "hashbrown" version = "0.15.5" @@ -1393,6 +1511,12 @@ version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + [[package]] name = "hyper" version = "1.9.0" @@ -1403,9 +1527,11 @@ dependencies = [ "bytes", "futures-channel", "futures-core", + "h2", "http", "http-body", "httparse", + "httpdate", "itoa", "pin-project-lite", "smallvec", @@ -1413,6 +1539,19 @@ dependencies = [ "want", ] +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.20" @@ -1420,11 +1559,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" dependencies = [ "bytes", + "futures-channel", + "futures-util", "http", "http-body", "hyper", + "libc", "pin-project-lite", + "socket2 0.6.3", "tokio", + "tower-service", + "tracing", ] [[package]] @@ -1485,6 +1630,16 @@ version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09e54e57b4c48b40f7aec75635392b12b3421fa26fe8b4332e63138ed278459c" +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", +] + [[package]] name = "indexmap" version = "2.14.0" @@ -1858,6 +2013,12 @@ dependencies = [ "xml5ever", ] +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + [[package]] name = "memchr" version = "2.8.0" @@ -1888,6 +2049,12 @@ dependencies = [ "autocfg", ] +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -1938,6 +2105,12 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "multimap" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" + 
[[package]] name = "new_debug_unreachable" version = "1.0.6" @@ -2233,6 +2406,16 @@ dependencies = [ "sha2", ] +[[package]] +name = "petgraph" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +dependencies = [ + "fixedbitset 0.5.7", + "indexmap 2.14.0", +] + [[package]] name = "phf" version = "0.11.3" @@ -2285,6 +2468,26 @@ dependencies = [ "siphasher", ] +[[package]] +name = "pin-project" +version = "1.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "pin-project-lite" version = "0.2.17" @@ -2304,7 +2507,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "740ebea15c5d1428f910cd1a5f52cebf8d25006245ed8ade92702f4943d91e07" dependencies = [ "base64 0.22.1", - "indexmap", + "indexmap 2.14.0", "quick-xml", "serde", "time", @@ -2378,6 +2581,122 @@ dependencies = [ "yansi", ] +[[package]] +name = "prost" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" +dependencies = [ + "heck", + "itertools", + "log", + "multimap", + "once_cell", + "petgraph", + "prettyplease", + "prost", + "prost-types", + "regex", + "syn 2.0.117", + 
"tempfile", +] + +[[package]] +name = "prost-derive" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "prost-types" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" +dependencies = [ + "prost", +] + +[[package]] +name = "protoc-bin-vendored" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1c381df33c98266b5f08186583660090a4ffa0889e76c7e9a5e175f645a67fa" +dependencies = [ + "protoc-bin-vendored-linux-aarch_64", + "protoc-bin-vendored-linux-ppcle_64", + "protoc-bin-vendored-linux-s390_64", + "protoc-bin-vendored-linux-x86_32", + "protoc-bin-vendored-linux-x86_64", + "protoc-bin-vendored-macos-aarch_64", + "protoc-bin-vendored-macos-x86_64", + "protoc-bin-vendored-win32", +] + +[[package]] +name = "protoc-bin-vendored-linux-aarch_64" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c350df4d49b5b9e3ca79f7e646fde2377b199e13cfa87320308397e1f37e1a4c" + +[[package]] +name = "protoc-bin-vendored-linux-ppcle_64" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a55a63e6c7244f19b5c6393f025017eb5d793fd5467823a099740a7a4222440c" + +[[package]] +name = "protoc-bin-vendored-linux-s390_64" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dba5565db4288e935d5330a07c264a4ee8e4a5b4a4e6f4e83fad824cc32f3b0" + +[[package]] +name = "protoc-bin-vendored-linux-x86_32" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8854774b24ee28b7868cd71dccaae8e02a2365e67a4a87a6cd11ee6cdbdf9cf5" + +[[package]] +name = 
"protoc-bin-vendored-linux-x86_64" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b38b07546580df720fa464ce124c4b03630a6fb83e05c336fea2a241df7e5d78" + +[[package]] +name = "protoc-bin-vendored-macos-aarch_64" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89278a9926ce312e51f1d999fee8825d324d603213344a9a706daa009f1d8092" + +[[package]] +name = "protoc-bin-vendored-macos-x86_64" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81745feda7ccfb9471d7a4de888f0652e806d5795b61480605d4943176299756" + +[[package]] +name = "protoc-bin-vendored-win32" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95067976aca6421a523e491fce939a3e65249bac4b977adee0ee9771568e8aa3" + [[package]] name = "pulldown-cmark" version = "0.13.3" @@ -2433,6 +2752,8 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ + "libc", + "rand_chacha 0.3.1", "rand_core 0.6.4", ] @@ -2442,10 +2763,20 @@ version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ - "rand_chacha", + "rand_chacha 0.9.0", "rand_core 0.9.5", ] +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", +] + [[package]] name = "rand_chacha" version = "0.9.0" @@ -2461,6 +2792,9 @@ name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", +] 
[[package]] name = "rand_core" @@ -2709,6 +3043,15 @@ dependencies = [ "security-framework", ] +[[package]] +name = "rustls-pemfile" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "rustls-pki-types" version = "1.14.0" @@ -2831,7 +3174,7 @@ version = "1.0.149" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" dependencies = [ - "indexmap", + "indexmap 2.14.0", "itoa", "memchr", "serde", @@ -2935,6 +3278,16 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7c388c1b5e93756d0c740965c41e8822f866621d41acbdf6336a6a168f8840c" +[[package]] +name = "socket2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "socket2" version = "0.6.3" @@ -3049,6 +3402,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" + [[package]] name = "syntect" version = "5.3.0" @@ -3127,7 +3486,7 @@ dependencies = [ "fancy-regex", "filedescriptor", "finl_unicode", - "fixedbitset", + "fixedbitset 0.4.2", "hex", "lazy_static", "libc", @@ -3287,7 +3646,7 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2", + "socket2 0.6.3", "tokio-macros", "windows-sys 0.61.2", ] @@ -3313,6 +3672,17 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-stream" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + [[package]] name = "tokio-util" version = "0.7.18" @@ -3327,6 +3697,130 @@ dependencies = [ "tokio", ] +[[package]] +name = "tonic" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64 0.22.1", + "bytes", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "prost", + "rustls-native-certs", + "rustls-pemfile", + "socket2 0.5.10", + "tokio", + "tokio-rustls", + "tokio-stream", + "tower 0.4.13", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-build" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11" +dependencies = [ + "prettyplease", + "proc-macro2", + "prost-build", + "prost-types", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "indexmap 1.9.3", + "pin-project", + "pin-project-lite", + "rand 0.8.5", + "slab", + "tokio", + "tokio-util", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", +] + [[package]] name = "tree-sitter" version = "0.26.8" @@ -3885,7 +4379,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" dependencies = [ "anyhow", - "indexmap", + "indexmap 2.14.0", "wasm-encoder", "wasmparser", ] @@ -3898,7 +4392,7 @@ checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ "bitflags 2.11.0", "hashbrown 0.15.5", - "indexmap", + "indexmap 2.14.0", "semver", ] @@ -4267,7 +4761,7 @@ checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" dependencies = [ "anyhow", "heck", - "indexmap", + "indexmap 2.14.0", "prettyplease", "syn 2.0.117", "wasm-metadata", @@ -4298,7 +4792,7 @@ checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", 
"bitflags 2.11.0", - "indexmap", + "indexmap 2.14.0", "log", "serde", "serde_derive", @@ -4317,7 +4811,7 @@ checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" dependencies = [ "anyhow", "id-arena", - "indexmap", + "indexmap 2.14.0", "log", "semver", "serde", diff --git a/Cargo.toml b/Cargo.toml index 313dcd6..8a73852 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,6 +61,11 @@ futures = "0.3" capnp = "0.25" capnp-rpc = "0.25" +tonic = { version = "0.12", features = ["tls", "tls-roots"] } +prost = "0.13" +async-stream = "0.3" +tokio-stream = "0.1" + tokenizers = "0.22" http = "1" @@ -74,10 +79,13 @@ imagesize = "0.14" rustls = "0.23" tokio-rustls = "0.26" rustls-native-certs = "0.8" +rustls-pemfile = "2" serde_urlencoded = "0.7" [build-dependencies] capnpc = "0.25" +tonic-build = { version = "0.12", default-features = false, features = ["prost", "transport"] } +protoc-bin-vendored = "3" [lib] name = "consciousness" diff --git a/build.rs b/build.rs index 808bf31..5f77ae4 100644 --- a/build.rs +++ b/build.rs @@ -13,4 +13,21 @@ fn main() { .file("schema/channel.capnp") .run() .expect("capnp compile failed (channel.capnp)"); + + // Generate salience.v1 gRPC client + message types from proto. + // Server side (python) is generated separately via grpcio-tools. + // Use vendored protoc so we don't require a system install. + let protoc = protoc_bin_vendored::protoc_bin_path() + .expect("vendored protoc not available for this platform"); + // SAFETY: build script is single-threaded at this point; setting env + // before invoking tonic_build is the documented way to point it at a + // non-PATH protoc. 
+ unsafe { std::env::set_var("PROTOC", protoc); } + tonic_build::configure() + .build_server(false) + .build_client(true) + .compile_protos(&["proto/salience.proto"], &["proto"]) + .expect("tonic_build compile failed (salience.proto)"); + + println!("cargo:rerun-if-changed=proto/salience.proto"); } diff --git a/proto/salience.proto b/proto/salience.proto new file mode 100644 index 0000000..01c0f1e --- /dev/null +++ b/proto/salience.proto @@ -0,0 +1,260 @@ +// salience.proto — stateful generation + per-token concept readout over gRPC. +// +// Shape: +// - One server-streaming RPC (Generate) for inference. Every other +// operation is unary. This is the minimum streaming we need — +// tokens arrive one at a time with optional readouts / logprobs — +// and keeping everything else unary makes the client dramatically +// simpler than a single bidi state machine did. +// +// - Server-side sessions hold the token list and image binaries. +// Sessions exist for bandwidth: at 200K tokens we'd otherwise +// re-ship ~800KB every turn, which hurts badly over a WAN link. +// vLLM's prefix cache holds the KV; the session just gives the +// client a handle so it can send deltas. +// +// - The client is the source of truth for prompt content. The server +// is the source of truth for image token expansion (how many +// IMAGE_PAD tokens an image becomes under this model). The client +// never writes vision tokens itself — AppendImage appends the whole +// <|vision_start|> + IMAGE_PAD×N + <|vision_end|> block server-side. +// +// - Every mutation carries (offset, truncating): the client's view of +// the server's current length, plus whether the client is deliberately +// rewriting history. Server validates on each call and rejects drift. +// No silent divergence, no migration bugs. +// +// - Errors use gRPC status codes. 
NOT_FOUND for missing sessions, +// FAILED_PRECONDITION for offset drift or image-block splits, +// RESOURCE_EXHAUSTED for context overflow, ABORTED for "session busy". +// +// Not in v1: +// - Authentication beyond a shared bearer token in gRPC metadata. +// - Multi-tenant session namespacing. +// - Sampling traces beyond top-k logprobs. + +syntax = "proto3"; + +package salience.v1; + +// ============================================================ +// Service +// ============================================================ + +service Salience { + // Create a fresh session. Client uses session_id on every subsequent + // RPC until CloseSession or TTL eviction (default 30 min idle). To + // refresh TTL across a long pause, issue a no-op Generate (empty + // append_tokens, max_tokens=0, no ranges). + rpc OpenSession(OpenSessionRequest) returns (OpenSessionResponse); + + // Release the session's tokens + images. Idempotent. + rpc CloseSession(CloseSessionRequest) returns (CloseSessionResponse); + + // Branch a session at a given token position. The new session + // inherits tokens [0, at_position) and any images whose vision + // block lies fully in that range. Rejected with FAILED_PRECONDITION + // if at_position falls inside an image block (client picks a clean + // boundary). + rpc ForkSession(ForkSessionRequest) returns (ForkSessionResponse); + + // Append an image to the session. Server decodes, runs vLLM's + // multimodal pipeline to compute N (IMAGE_PAD count), and writes + // the whole vision block into session.tokens. Returns N and the + // new total length. + rpc AppendImage(AppendImageRequest) returns (AppendImageResponse); + + // Prefill + optionally decode. See GenerateRequest for full + // semantics; stream yields Token events (with optional readouts / + // logprobs per position) followed by a terminating Done. 
+ rpc Generate(GenerateRequest) returns (stream GenerateEvent); + + // Readout manifest for the currently-loaded model — concept names, + // layer indices, tensor dtype. Stateless; fetch once at client + // startup and cache. + rpc GetReadoutManifest(GetReadoutManifestRequest) returns (ReadoutManifest); +} + +// ============================================================ +// Lifecycle +// ============================================================ + +message OpenSessionRequest { + // Model identifier, must match vLLM's served model. The server + // only has one model loaded; this is a safety check on what the + // client thinks it's talking to. + string model = 1; +} + +message OpenSessionResponse { + string session_id = 1; + uint32 max_model_len = 2; +} + +message CloseSessionRequest { + string session_id = 1; +} + +message CloseSessionResponse {} + +message ForkSessionRequest { + string session_id = 1; // source session + uint32 at_position = 2; // new session inherits tokens [0, at_position) +} + +message ForkSessionResponse { + string session_id = 1; // new session +} + +// ============================================================ +// Mutation +// ============================================================ + +message AppendImageRequest { + string session_id = 1; + + // Image bytes (PNG / JPEG / WebP / …). + bytes data = 2; + + // MIME type, e.g. "image/png". + string mime = 3; + + // Client's view of the session's current token length. Must equal + // the server's actual length, OR be strictly less when + // truncating=true. Any mismatch is FAILED_PRECONDITION. + uint32 offset = 4; + + // If true, server truncates session.tokens to `offset` before + // appending. Rejected with FAILED_PRECONDITION if the truncation + // would split an image block. + bool truncating = 5; +} + +message AppendImageResponse { + // Count of <|image_pad|> tokens inside the vision block. 
Does not + // include the <|vision_start|> / <|vision_end|> bookends, which + // contribute one token each. + uint32 placeholder_count = 1; + + // Session's total token length after this append, including both + // bookends (= offset + placeholder_count + 2, barring truncation). + uint32 total_length = 2; +} + +// ============================================================ +// Inference +// ============================================================ + +message GenerateRequest { + string session_id = 1; + + // Tokens to append before prefill. May be empty. Client must NOT + // include vision tokens (<|vision_start|>, <|image_pad|>, + // <|vision_end|>) — those live in the session via AppendImage. + repeated uint32 append_tokens = 2; + + // Offset / truncating — same semantics as AppendImage. Truncation + // that splits an image block is FAILED_PRECONDITION. + uint32 offset = 3; + bool truncating = 4; + + // Decode budget. 0 = prefill only (no decode, emit Token events + // for positions covered by logprobs_ranges / readout_ranges, then + // Done; replaces the old /score endpoint). >0 = decode up to this + // many tokens, stopping early on EOS / stop_token_ids. + uint32 max_tokens = 5; + + // Position ranges (absolute, within the session's post-append + // token list) at which to emit logprobs on Token events. Empty = + // no logprobs. `logprob_top_k > 0` returns the top-k alternative + // tokens at each covered position; `logprob_top_k == 0` returns + // only the sampled-token's logprob. + repeated PositionRange logprobs_ranges = 6; + uint32 logprob_top_k = 7; + + // Position ranges at which to emit concept-readout vectors. Empty + // = no readouts. Logical shape per position is + // [n_layers][n_concepts] — see GetReadoutManifest. + repeated PositionRange readout_ranges = 8; + + // Sampling parameters. Meaningful only when max_tokens > 0. 
+ float temperature = 9; // default 1.0 when zero + float top_p = 10; // default 1.0 when zero + uint32 top_k = 11; // default 0 (disabled) + repeated uint32 stop_token_ids = 12; + + // vLLM scheduler priority (0 = interactive, 10 = batch). + int32 priority = 13; +} + +message PositionRange { + uint32 start = 1; // inclusive + uint32 end = 2; // exclusive +} + +message GenerateEvent { + oneof event { + Token token = 1; + GenerateDone done = 2; + } +} + +message Token { + // Token id at this position. For prefill this is the prompt token; + // for decode it's the sampled token. + uint32 id = 1; + + // Absolute position in the session's token list. + uint32 position = 2; + + // True for prefill positions, false for decode. + bool is_prefill = 3; + + // Concept readout at this position. Empty if the position wasn't + // covered by readout_ranges. + repeated float readout = 4 [packed = true]; + + // Top-k alternative tokens' logprobs at this position — populated + // when the position is covered by logprobs_ranges and + // logprob_top_k > 0. + repeated TokenLogprob logprobs = 5; + + // Logprob of the token at `position` (the prompt token for + // prefill, the sampled token for decode). Populated when the + // position is covered by logprobs_ranges. 
+ float sampled_logprob = 6; + bool has_sampled_logprob = 7; +} + +message TokenLogprob { + uint32 id = 1; + float logprob = 2; +} + +message GenerateDone { + uint32 prompt_tokens = 1; + uint32 completion_tokens = 2; + uint32 total_tokens = 3; + + enum FinishReason { + FINISH_REASON_UNSPECIFIED = 0; + FINISH_REASON_EOS = 1; // emitted EOS / stop token + FINISH_REASON_LENGTH = 2; // hit max_tokens + FINISH_REASON_CANCELLED = 3; // client cancelled + FINISH_REASON_STOP_STRING = 4; // matched a stop string + } + FinishReason finish_reason = 4; +} + +// ============================================================ +// Readout manifest +// ============================================================ + +message GetReadoutManifestRequest {} + +message ReadoutManifest { + repeated string concepts = 1; + repeated uint32 layers = 2; + uint32 hidden_size = 3; + string dtype = 4; +} diff --git a/src/agent/api/http.rs b/src/agent/api/http.rs index 429350b..65b759b 100644 --- a/src/agent/api/http.rs +++ b/src/agent/api/http.rs @@ -100,7 +100,7 @@ impl HttpClient { .map_err(|e| anyhow::anyhow!("invalid server name: {e}"))?; let connector = tokio_rustls::TlsConnector::from(self.tls.clone()); let tls = connector.connect(server_name.to_owned(), tcp).await - .context("TLS handshake")?; + .map_err(|e| anyhow::anyhow!("TLS handshake to {host}: {e}"))?; TokioIo::new(Box::new(tls) as Box) } else { TokioIo::new(Box::new(tcp) as Box) @@ -190,6 +190,7 @@ impl HttpClientBuilder { } pub fn build(self) -> HttpClient { + install_rustls_crypto_provider(); let certs = rustls_native_certs::load_native_certs() .certs.into_iter() .collect::>(); @@ -197,6 +198,13 @@ impl HttpClientBuilder { for cert in certs { root_store.add(cert).ok(); } + // Also trust any `.pem` files under `~/.consciousness/certs/` — + // self-signed server certs for our own vllm hosts live there. + // Drop a new `.pem` in the dir to trust a new server; no + // code change needed. 
+ for cert in load_user_certs() { + root_store.add(cert).ok(); + } let tls = Arc::new( ClientConfig::builder() .with_root_certificates(root_store) @@ -210,6 +218,65 @@ impl HttpClientBuilder { } } +/// Install rustls' default crypto provider exactly once per process. +/// rustls 0.23 doesn't pick one automatically when multiple features +/// could provide it (e.g. when tonic pulls in both ring and aws-lc-rs +/// via transitive deps). Idempotent via OnceLock; safe to call from +/// multiple callers. +fn install_rustls_crypto_provider() { + static ONCE: std::sync::OnceLock<()> = std::sync::OnceLock::new(); + ONCE.get_or_init(|| { + let _ = rustls::crypto::ring::default_provider().install_default(); + }); +} + +/// Load the concatenated PEM bytes of every `.pem` file under +/// `~/.consciousness/certs/` — suitable for passing to a tonic +/// `ClientTlsConfig::ca_certificate(Certificate::from_pem(...))` call +/// so gRPC connections trust the same self-signed servers the HTTP +/// path does. Silent on a missing home dir, missing certs dir, or +/// unreadable files — those are "no extra certs trusted" rather than +/// hard failures, to keep startup robust. `load_user_certs` is the +/// DER-certificate counterpart used by `HttpClientBuilder::build`; it +/// additionally skips PEM entries that fail to parse.
+pub(crate) fn load_user_certs_pem_bytes() -> Vec { + let mut out = Vec::new(); + let Some(home) = dirs::home_dir() else { return out }; + let dir = home.join(".consciousness").join("certs"); + let Ok(entries) = std::fs::read_dir(&dir) else { return out }; + for entry in entries.flatten() { + let path = entry.path(); + if path.extension().and_then(|e| e.to_str()) != Some("pem") { + continue; + } + if let Ok(bytes) = std::fs::read(&path) { + out.extend_from_slice(&bytes); + if !bytes.ends_with(b"\n") { + out.push(b'\n'); + } + } + } + out +} + +fn load_user_certs() -> Vec> { + let mut out = Vec::new(); + let Some(home) = dirs::home_dir() else { return out }; + let dir = home.join(".consciousness").join("certs"); + let Ok(entries) = std::fs::read_dir(&dir) else { return out }; + for entry in entries.flatten() { + let path = entry.path(); + if path.extension().and_then(|e| e.to_str()) != Some("pem") { + continue; + } + let Ok(bytes) = std::fs::read(&path) else { continue }; + for cert in rustls_pemfile::certs(&mut bytes.as_slice()).flatten() { + out.push(cert); + } + } + out +} + /// Trait alias for streams that work with hyper's IO adapter. trait IoStream: tokio::io::AsyncRead + tokio::io::AsyncWrite + Send + Unpin + 'static {} impl IoStream for T {} diff --git a/src/agent/api/mod.rs b/src/agent/api/mod.rs index be5e58e..06ecf70 100644 --- a/src/agent/api/mod.rs +++ b/src/agent/api/mod.rs @@ -7,13 +7,14 @@ // Set POC_DEBUG=1 for verbose per-turn logging. pub mod http; +pub mod salience; -use std::time::{Duration, Instant}; +use std::time::Duration; use anyhow::Result; use tokio::sync::mpsc; use serde::Deserialize; -use http::{HttpClient, HttpResponse}; +use http::HttpClient; #[derive(Debug, Clone, Deserialize)] pub struct Usage { @@ -48,6 +49,7 @@ impl Drop for AbortOnDrop { /// Sampling parameters for model generation. 
#[derive(Clone, Copy)] +#[allow(dead_code)] // fields used once Generate RPC lands in a later step pub(crate) struct SamplingParams { pub temperature: f32, pub top_p: f32, @@ -74,6 +76,10 @@ pub struct ApiClient { api_key: String, pub model: String, base_url: String, + /// Cached readout manifest — fetched once per process and shared + /// across ApiClient clones (every Agent/fork gets the same cell). + /// `None` after fetch means the server has readout disabled (404). + manifest: std::sync::Arc>>, } impl ApiClient { @@ -88,36 +94,30 @@ impl ApiClient { api_key: api_key.to_string(), model: model.to_string(), base_url: base_url.trim_end_matches('/').to_string(), + manifest: std::sync::Arc::new(tokio::sync::OnceCell::new()), } } - pub(crate) fn stream_completion_mm( + /// Stream generation via a gRPC session. Stubbed during the + /// unary-rewrite transition — the Generate RPC is wired in a + /// later step of this series. Until then, callers that reach + /// this path get a StreamToken::Error. 
+ pub(crate) fn stream_session_mm( &self, - prompt_tokens: &[u32], - images: &[super::context::WireImage], - sampling: SamplingParams, - priority: Option, + _session_lock: std::sync::Arc>>, + _prompt_tokens: &[u32], + _images: &[super::context::WireImage], + _sampling: SamplingParams, + _priority: Option, ) -> (mpsc::UnboundedReceiver, AbortOnDrop) { let (tx, rx) = mpsc::unbounded_channel(); - let client = self.client.clone(); - let api_key = self.api_key.clone(); - let model = self.model.clone(); - let prompt_tokens = prompt_tokens.to_vec(); - let images: Vec<(Vec, String)> = images.iter() - .map(|i| (i.bytes.clone(), i.mime.clone())) - .collect(); - let base_url = self.base_url.clone(); - let handle = tokio::spawn(async move { - let result = stream_completions( - &client, &base_url, &api_key, &model, - &prompt_tokens, &images, &tx, sampling, priority, - ).await; - if let Err(e) = result { - let _ = tx.send(StreamToken::Error(e.to_string())); - } + let _ = tx.send(StreamToken::Error( + "Generate RPC not yet wired after protocol rewrite — see \ + proto/salience.proto; AppendImage / Generate land next." + .into(), + )); }); - (rx, AbortOnDrop(handle)) } @@ -128,386 +128,31 @@ impl ApiClient { /// readout is enabled on the server, `Ok(None)` on 404 (disabled), /// or an error on any other failure. /// - /// Call once at startup and cache the result; the manifest doesn't - /// change during a server run. + /// First call performs the HTTP fetch; subsequent calls (including + /// across ApiClient clones sharing the same cell) return the + /// cached result. The manifest doesn't change during a server run. 
pub async fn fetch_readout_manifest(&self) -> Result> { - let url = format!("{}/readout/manifest", self.base_url); - let auth = format!("Bearer {}", self.api_key); - let response = self - .client - .get_with_headers(&url, &[("Authorization", &auth)]) - .await - .map_err(|e| anyhow::anyhow!("readout manifest fetch ({}): {}", url, e))?; - let status = response.status(); - if status.as_u16() == 404 { - return Ok(None); - } - if !status.is_success() { - let body = response.text().await.unwrap_or_default(); - let n = body.floor_char_boundary(body.len().min(500)); - anyhow::bail!("readout manifest HTTP {} ({}): {}", status, url, &body[..n]); - } - Ok(Some(response.json().await?)) + let manifest = self.manifest.get_or_try_init(|| async { + let url = format!("{}/readout/manifest", self.base_url); + let auth = format!("Bearer {}", self.api_key); + let response = self + .client + .get_with_headers(&url, &[("Authorization", &auth)]) + .await + .map_err(|e| anyhow::anyhow!("readout manifest fetch ({}): {}", url, e))?; + let status = response.status(); + if status.as_u16() == 404 { + return Ok::<_, anyhow::Error>(None); + } + if !status.is_success() { + let body = response.text().await.unwrap_or_default(); + let n = body.floor_char_boundary(body.len().min(500)); + anyhow::bail!("readout manifest HTTP {} ({}): {}", status, url, &body[..n]); + } + Ok(Some(response.json().await?)) + }).await?; + Ok(manifest.clone()) } } -async fn stream_completions( - client: &HttpClient, - base_url: &str, - api_key: &str, - model: &str, - prompt_tokens: &[u32], - images: &[(Vec, String)], - tx: &mpsc::UnboundedSender, - sampling: SamplingParams, - priority: Option, -) -> anyhow::Result<()> { - let mut request = serde_json::json!({ - "model": model, - "prompt": prompt_tokens, - "max_tokens": 16384, - "temperature": sampling.temperature, - "top_p": sampling.top_p, - "top_k": sampling.top_k, - "stream": true, - "return_token_ids": true, - "skip_special_tokens": false, - "stop_token_ids": 
[super::tokenizer::IM_END], - }); - if !images.is_empty() { - use base64::Engine; - let b64 = base64::engine::general_purpose::STANDARD; - let uris: Vec = images.iter() - .map(|(bytes, mime)| format!("data:{};base64,{}", mime, b64.encode(bytes))) - .collect(); - request["multi_modal_data"] = serde_json::json!({ "image": uris }); - } - if let Some(p) = priority { - request["priority"] = serde_json::json!(p); - } - - let url = format!("{}/completions", base_url); - let debug_label = format!("{} prompt tokens, model={}", prompt_tokens.len(), model); - - let mut response = send_and_check( - client, &url, &request, - ("Authorization", &format!("Bearer {}", api_key)), - &[], &debug_label, None, - ).await?; - - let mut reader = SseReader::new(); - let mut usage = None; - - while let Some(event) = reader.next_event(&mut response).await? { - if let Some(err_msg) = event["error"]["message"].as_str() { - anyhow::bail!("API error in stream: {}", err_msg); - } - - if let Some(u) = event["usage"].as_object() { - if let Ok(u) = serde_json::from_value::(serde_json::Value::Object(u.clone())) { - usage = Some(u); - } - } - - let choices = match event["choices"].as_array() { - Some(c) => c, - None => continue, - }; - - for choice in choices { - // `readout`, if present, is a nested list - // `[num_tokens][n_layers][n_concepts]`. Parse it once per - // chunk and pair rows with token ids by index — the rows - // are in the same order as `token_ids`. 
- let readouts: Option> = choice["readout"] - .as_array() - .map(|outer| { - outer.iter().filter_map(|per_token| { - per_token.as_array().map(|layers| { - layers.iter().filter_map(|per_layer| { - per_layer.as_array().map(|vals| { - vals.iter() - .filter_map(|v| v.as_f64().map(|f| f as f32)) - .collect::>() - }) - }).collect::>>() - }) - }).collect() - }); - - if let Some(ids) = choice["token_ids"].as_array() { - for (i, id_val) in ids.iter().enumerate() { - if let Some(id) = id_val.as_u64() { - let readout = readouts - .as_ref() - .and_then(|r| r.get(i).cloned()); - let _ = tx.send(StreamToken::Token { - id: id as u32, - readout, - }); - } - } - } else if let Some(text) = choice["text"].as_str() { - // Fallback: provider didn't return token_ids, encode locally. - // No readout available in this path — the encoder may - // produce a different token count than the server did. - if !text.is_empty() { - for id in super::tokenizer::encode(text) { - let _ = tx.send(StreamToken::Token { id, readout: None }); - } - } - } - } - } - - let _ = tx.send(StreamToken::Done { usage }); - Ok(()) -} - -/// Send an HTTP request and check for errors. 
-pub(crate) async fn send_and_check( - client: &HttpClient, - url: &str, - body: &impl serde::Serialize, - auth_header: (&str, &str), - extra_headers: &[(&str, &str)], - debug_label: &str, - request_json: Option<&str>, -) -> Result { - let debug = std::env::var("POC_DEBUG").is_ok(); - let start = Instant::now(); - - if debug { - let payload_size = serde_json::to_string(body) - .map(|s| s.len()) - .unwrap_or(0); - dbglog!( - "request: {}K payload, {}", - payload_size / 1024, debug_label, - ); - } - - let mut headers: Vec<(&str, &str)> = Vec::with_capacity(extra_headers.len() + 1); - headers.push(auth_header); - headers.extend_from_slice(extra_headers); - - let response = client - .send_json("POST", url, &headers, body) - .await - .map_err(|e| { - let msg = e.to_string(); - let cause = if msg.contains("connect timeout") || msg.contains("TCP connect") { - "connection refused" - } else if msg.contains("request timeout") { - "request timed out" - } else { - "request error" - }; - anyhow::anyhow!("{} ({}): {}", cause, url, msg) - })?; - - let status = response.status(); - let elapsed = start.elapsed(); - - if debug { - for name in [ - "x-ratelimit-remaining", - "x-ratelimit-limit", - "x-request-id", - ] { - if let Some(val) = response.header(name) { - dbglog!("header {}: {}", name, val); - } - } - } - - if !status.is_success() { - let body = response.text().await.unwrap_or_default(); - dbglog!( - "HTTP {} after {:.1}s ({}): {}", - status, - elapsed.as_secs_f64(), - url, - &body[..body.floor_char_boundary(body.len().min(500))] - ); - if let Some(json) = request_json { - let log_dir = dirs::home_dir() - .unwrap_or_default() - .join(".consciousness/logs/failed-requests"); - let _ = std::fs::create_dir_all(&log_dir); - let ts = chrono::Local::now().format("%Y%m%dT%H%M%S"); - let path = log_dir.join(format!("{}.json", ts)); - if std::fs::write(&path, json).is_ok() { - dbglog!( - "saved failed request to {} (HTTP {})", path.display(), status - ); - } - } - anyhow::bail!("HTTP 
{} ({}): {}", status, url, &body[..body.floor_char_boundary(body.len().min(1000))]); - } - - if debug { - dbglog!( - "connected in {:.1}s (HTTP {})", - elapsed.as_secs_f64(), - status.as_u16() - ); - } - - Ok(response) -} - -/// SSE stream reader. Handles the generic SSE plumbing shared by both -/// backends: chunk reading with timeout, line buffering, `data:` prefix -/// stripping, `[DONE]` detection, JSON parsing, and parse error diagnostics. -/// Yields parsed events as serde_json::Value — each backend handles its -/// own event types. -pub(crate) struct SseReader { - line_buf: String, - chunk_timeout: Duration, - pub stream_start: Instant, - pub chunks_received: u64, - pub sse_lines_parsed: u64, - pub sse_parse_errors: u64, - debug: bool, - done: bool, - /// Serialized request payload — saved to disk on errors for replay debugging. - pub(crate) request_json: Option, -} - -impl SseReader { - pub(crate) fn new() -> Self { - Self { - line_buf: String::new(), - chunk_timeout: Duration::from_secs(crate::config::get().api_stream_timeout_secs), - stream_start: Instant::now(), - chunks_received: 0, - sse_lines_parsed: 0, - sse_parse_errors: 0, - debug: std::env::var("POC_DEBUG").is_ok(), - done: false, - request_json: None, - } - } - - /// Attach the serialized request payload for error diagnostics. - /// Save the request payload to disk for replay debugging. - fn save_failed_request(&self, reason: &str) { - let Some(ref json) = self.request_json else { return }; - let log_dir = dirs::home_dir() - .unwrap_or_default() - .join(".consciousness/logs/failed-requests"); - let _ = std::fs::create_dir_all(&log_dir); - let ts = chrono::Local::now().format("%Y%m%dT%H%M%S"); - let path = log_dir.join(format!("{}.json", ts)); - if std::fs::write(&path, json).is_ok() { - dbglog!( - "saved failed request to {} ({})", path.display(), reason - ); - } - } - - /// Read the next SSE event from the response stream. 
- /// Returns Ok(Some(value)) for each parsed data line, - /// Ok(None) when the stream ends or [DONE] is received. - pub(crate) async fn next_event( - &mut self, - response: &mut HttpResponse, - ) -> Result> { - loop { - // Drain complete lines from the buffer before reading more chunks - while let Some(newline_pos) = self.line_buf.find('\n') { - let line = self.line_buf[..newline_pos].trim().to_string(); - self.line_buf = self.line_buf[newline_pos + 1..].to_string(); - - if line == "data: [DONE]" { - self.done = true; - return Ok(None); - } - if line.is_empty() - || line.starts_with("event: ") - || !line.starts_with("data: ") - { - continue; - } - - let json_str = &line[6..]; - self.sse_lines_parsed += 1; - - match serde_json::from_str(json_str) { - Ok(v) => return Ok(Some(v)), - Err(e) => { - self.sse_parse_errors += 1; - if self.sse_parse_errors == 1 || self.debug { - let preview = if json_str.len() > 200 { - format!("{}...", &json_str[..200]) - } else { - json_str.to_string() - }; - dbglog!( - "SSE parse error (#{}) {}: {}", - self.sse_parse_errors, e, preview - ); - } - continue; - } - } - } - - if self.done { - return Ok(None); - } - - // Read more data from the response stream - match tokio::time::timeout(self.chunk_timeout, response.chunk()).await { - Ok(Ok(Some(chunk))) => { - self.chunks_received += 1; - self.line_buf.push_str(&String::from_utf8_lossy(&chunk)); - } - Ok(Ok(None)) => return Ok(None), - Ok(Err(e)) => { - let buf_preview = if self.line_buf.is_empty() { - "(empty)".to_string() - } else { - let n = self.line_buf.len().min(500); - format!("{}B: {}", self.line_buf.len(), &self.line_buf[..n]) - }; - let msg = format!( - "stream error after {} chunks, {:.1}s, {} sse lines: {} | buf: {}", - self.chunks_received, - self.stream_start.elapsed().as_secs_f64(), - self.sse_lines_parsed, - e, buf_preview, - ); - dbglog!("{}", msg); - self.save_failed_request(&msg); - return Err(e.into()); - } - Err(_) => { - let buf_preview = if self.line_buf.is_empty() 
{ - "(empty)".to_string() - } else { - let n = self.line_buf.len().min(500); - format!("{}B: {}", self.line_buf.len(), &self.line_buf[..n]) - }; - let msg = format!( - "stream timeout: {}s, {} chunks, {} sse lines, {:.1}s elapsed | buf: {}", - self.chunk_timeout.as_secs(), - self.chunks_received, - self.sse_lines_parsed, - self.stream_start.elapsed().as_secs_f64(), - buf_preview, - ); - dbglog!("{}", msg); - self.save_failed_request(&msg); - anyhow::bail!( - "stream timeout: no data for {}s ({} chunks received)", - self.chunk_timeout.as_secs(), - self.chunks_received - ); - } - } - } - } -} diff --git a/src/agent/api/salience.rs b/src/agent/api/salience.rs new file mode 100644 index 0000000..f9ea83d --- /dev/null +++ b/src/agent/api/salience.rs @@ -0,0 +1,249 @@ +// agent/api/salience.rs — gRPC client bindings for salience.v1. +// +// Thin wrapper around the tonic-generated types. Every RPC except +// Generate is unary; Generate is server-streaming. Free functions +// (open/close session) wrap the lifecycle RPCs; `SessionHandle` just +// carries the id + connection params so later RPCs can reuse them. +// +// The old bidi Session() API is gone — see git history for its shape. + +#![allow(clippy::enum_variant_names)] + +use anyhow::{Context, Result}; +use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint}; + +/// Generated prost + tonic types for salience.v1. Call sites use +/// `pb::OpenSessionRequest`, `pb::Token`, etc. +pub mod pb { + tonic::include_proto!("salience.v1"); +} + +pub type SalienceClient = pb::salience_client::SalienceClient; + +/// Open a TLS-aware gRPC channel to the salience server. `base_url` +/// looks like `https://host:8443`. User-provided CA certs under +/// `~/.consciousness/certs/` are trusted in addition to the system +/// roots (for self-signed server certs). 
+pub async fn connect(base_url: &str) -> Result { + let mut endpoint = Endpoint::from_shared(base_url.to_string()) + .with_context(|| format!("invalid salience endpoint: {}", base_url))? + .connect_timeout(std::time::Duration::from_secs(30)) + .timeout(std::time::Duration::from_secs(600)); + + if base_url.starts_with("https://") { + let user_certs = super::http::load_user_certs_pem_bytes(); + let mut tls = ClientTlsConfig::new().with_native_roots(); + if !user_certs.is_empty() { + tls = tls.ca_certificate(Certificate::from_pem(user_certs)); + } + endpoint = endpoint + .tls_config(tls) + .with_context(|| "configuring tonic TLS")?; + } + + let channel = endpoint + .connect() + .await + .with_context(|| format!("failed to connect to salience server at {}", base_url))?; + Ok(pb::salience_client::SalienceClient::new(channel)) +} + +/// Derive the gRPC base URL from the HTTP completions base URL. +/// +/// vLLM's salience gRPC server listens on a different port (8443) from +/// the HTTP endpoint (8000) and accepts no path component. Given an +/// HTTP base like `https://host:8000/v1`, produce `https://host:8443`. +/// No-op when the path is empty and the port isn't 8000. +pub fn derive_grpc_url(http_base: &str) -> String { + let mut url = http_base.trim_end_matches('/').to_string(); + if let Some(proto_end) = url.find("://") { + let rest_start = proto_end + 3; + if let Some(path_slash) = url[rest_start..].find('/') { + url.truncate(rest_start + path_slash); + } + } + url.replace(":8000", ":8443") +} + +/// Attach a bearer token to a tonic request as gRPC metadata. +pub fn with_auth(req: &mut tonic::Request, api_key: &str) { + if api_key.is_empty() { + return; + } + let bearer = format!("Bearer {}", api_key); + if let Ok(val) = bearer.parse() { + req.metadata_mut().insert("authorization", val); + } +} + +/// Call the server's `OpenSession` RPC and return the response. 
+pub async fn open_session( + base_url: &str, + api_key: &str, + model: &str, +) -> Result { + let mut client = connect(base_url).await?; + let mut req = tonic::Request::new(pb::OpenSessionRequest { + model: model.to_string(), + }); + with_auth(&mut req, api_key); + let resp = client + .open_session(req) + .await + .with_context(|| "OpenSession RPC failed")?; + Ok(resp.into_inner()) +} + +/// Call the server's `CloseSession` RPC. Idempotent on the server. +pub async fn close_session(base_url: &str, api_key: &str, session_id: &str) -> Result<()> { + let mut client = connect(base_url).await?; + let mut req = tonic::Request::new(pb::CloseSessionRequest { + session_id: session_id.to_string(), + }); + with_auth(&mut req, api_key); + client + .close_session(req) + .await + .with_context(|| "CloseSession RPC failed")?; + Ok(()) +} + +/// Append an image to a session. Server decodes the image, computes N +/// via vLLM's own multimodal pipeline, writes the full vision block +/// (`<|vision_start|> + IMAGE_PAD×N + <|vision_end|>`) into +/// session.tokens, and returns (N, new total length). +/// +/// `offset` is the client's view of the session's current token count; +/// the server rejects if it diverges from its own (unless +/// `truncating=true`, in which case the server slices to `offset` +/// first — but never through a vision block). +pub async fn append_image( + base_url: &str, + api_key: &str, + session_id: &str, + data: Vec, + mime: String, + offset: u32, + truncating: bool, +) -> Result { + let mut client = connect(base_url).await?; + let mut req = tonic::Request::new(pb::AppendImageRequest { + session_id: session_id.to_string(), + data, + mime, + offset, + truncating, + }); + with_auth(&mut req, api_key); + let resp = client + .append_image(req) + .await + .with_context(|| "AppendImage RPC failed")?; + Ok(resp.into_inner()) +} + +/// Handle to a server-side session. 
Carries the id + connection params +/// so subsequent per-session RPCs (AppendImage, Generate, ForkSession) +/// can be issued without the caller juggling base_url / api_key each +/// time. +pub struct SessionHandle { + pub session_id: String, + pub max_model_len: u32, + pub base_url: String, + pub api_key: String, +} + +impl SessionHandle { + pub async fn open(base_url: &str, api_key: &str, model: &str) -> Result { + let grpc_url = derive_grpc_url(base_url); + log::debug!(target: "grpc", + "SessionHandle::open http_base={} -> grpc_url={}", + base_url, grpc_url); + let resp = open_session(&grpc_url, api_key, model).await?; + log::debug!(target: "grpc", + "SessionHandle::open session_id={} max_model_len={}", + resp.session_id, resp.max_model_len); + Ok(Self { + session_id: resp.session_id, + max_model_len: resp.max_model_len, + base_url: grpc_url, + api_key: api_key.to_string(), + }) + } + + pub async fn close(self) -> Result<()> { + close_session(&self.base_url, &self.api_key, &self.session_id).await + } + + /// Append an image via the server-side vision block. See + /// `append_image` free function for full semantics. + pub async fn append_image( + &self, + data: Vec, + mime: String, + offset: u32, + truncating: bool, + ) -> Result { + append_image( + &self.base_url, + &self.api_key, + &self.session_id, + data, + mime, + offset, + truncating, + ) + .await + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn generated_types_compile() { + // Exercise the shape of the new proto types — if build.rs + // stops regenerating against the proto, this stops compiling. 
+ let _open = pb::OpenSessionRequest { + model: "qwen3-vl".into(), + }; + let _tok = pb::Token { + id: 42, + position: 0, + is_prefill: false, + readout: vec![0.1, 0.2, 0.3], + logprobs: vec![pb::TokenLogprob { + id: 1, + logprob: -0.5, + }], + sampled_logprob: -0.1, + has_sampled_logprob: true, + }; + let _done = pb::GenerateDone { + prompt_tokens: 10, + completion_tokens: 20, + total_tokens: 30, + finish_reason: pb::generate_done::FinishReason::Eos as i32, + }; + let _evt = pb::GenerateEvent { + event: Some(pb::generate_event::Event::Done(_done)), + }; + } + + #[test] + fn derive_grpc_url_cases() { + assert_eq!( + derive_grpc_url("https://host:8000/v1"), + "https://host:8443", + ); + assert_eq!( + derive_grpc_url("https://host:8000/"), + "https://host:8443", + ); + assert_eq!( + derive_grpc_url("https://host:9000/v1"), + "https://host:9000", + ); + } +} diff --git a/src/agent/context.rs b/src/agent/context.rs index 2009cfc..ab21e21 100644 --- a/src/agent/context.rs +++ b/src/agent/context.rs @@ -359,8 +359,8 @@ impl AstNode { mime: impl Into, orig_height: u32, orig_width: u32, + token_count: u32, ) -> Self { - let token_count = qwen3_image_token_count(orig_height, orig_width); Self::Leaf(NodeLeaf::new(NodeBody::Image { bytes, mime: mime.into(), @@ -898,10 +898,12 @@ impl Ast for ContextState { } /// An image collected from the AST for a request body. The AST stores -/// the pre-expanded token form (N image_pads) for accurate budget -/// accounting; the wire form collapses each Image to a single -/// `<|image_pad|>` between vision bookends and ships the bytes -/// separately as multi_modal_data. +/// the pre-expanded token form (`<|vision_start|> + <|image_pad|>×N + +/// <|vision_end|>`), and the wire form mirrors that exactly so the +/// server's `session.tokens` length matches what vLLM's engine will +/// process. The authoritative N is obtained from the server via the +/// CountImageTokens RPC before the Image leaf is constructed. 
+#[derive(Clone)] pub struct WireImage { pub bytes: Vec, pub mime: String, @@ -911,9 +913,10 @@ fn wire_into(node: &AstNode, tokens: &mut Vec, images: &mut Vec) match node { AstNode::Leaf(leaf) => match leaf.body() { NodeBody::Image { bytes, mime, .. } => { - tokens.push(tokenizer::VISION_START); - tokens.push(tokenizer::IMAGE_PAD); - tokens.push(tokenizer::VISION_END); + // Send the pre-expanded token form (includes N + // <|image_pad|> tokens); engine's multi_modal + // pipeline pairs them with the binary data below. + tokens.extend_from_slice(leaf.token_ids()); images.push(WireImage { bytes: bytes.clone(), mime: mime.clone(), @@ -1225,11 +1228,20 @@ impl ContextState { // to at request time. Constants come from Qwen3.5-27B's preprocessor_config. // --------------------------------------------------------------------------- +// Test-only client-side estimate of image token expansion. Production +// callers obtain the authoritative count from the server via +// CountImageTokens; these constants and helpers stay around only to +// keep the context-shape unit tests self-contained. +#[cfg(test)] const QWEN3_PATCH_SIZE: u32 = 16; +#[cfg(test)] const QWEN3_MERGE_SIZE: u32 = 2; +#[cfg(test)] const QWEN3_MIN_PIXELS: u64 = 65_536; +#[cfg(test)] const QWEN3_MAX_PIXELS: u64 = 16_777_216; +#[cfg(test)] fn smart_resize(h: u32, w: u32, factor: u32, min_pixels: u64, max_pixels: u64) -> (u32, u32) { let max_s = h.max(w) as f64; let min_s = h.min(w) as f64; @@ -1258,10 +1270,10 @@ fn smart_resize(h: u32, w: u32, factor: u32, min_pixels: u64, max_pixels: u64) - } } -/// Compute how many `<|image_pad|>` tokens vLLM will emit for an image of -/// the given dimensions. Matches Qwen3VL's feature-size calculation exactly: -/// (grid_h * grid_w) / merge_size^2 -/// where (grid_h, grid_w) = resized dims / patch_size. +/// Test-only: client-side estimate of how many `<|image_pad|>` tokens +/// vLLM will emit for an image of the given dimensions. 
Production +/// callers use `salience::count_image_tokens` (server-authoritative). +#[cfg(test)] fn qwen3_image_token_count(orig_h: u32, orig_w: u32) -> u32 { let factor = QWEN3_PATCH_SIZE * QWEN3_MERGE_SIZE; let (rh, rw) = smart_resize(orig_h, orig_w, factor, QWEN3_MIN_PIXELS, QWEN3_MAX_PIXELS); @@ -1697,7 +1709,7 @@ mod tests { #[test] fn test_image_render_and_token_ids() { - let node = AstNode::image(vec![0u8, 1, 2, 3], "image/png", 512, 512); + let node = AstNode::image(vec![0u8, 1, 2, 3], "image/png", 512, 512, qwen3_image_token_count(512, 512)); let leaf = node.leaf().unwrap(); // 3 tokens of bookend + 256 image_pad tokens assert_eq!(leaf.token_ids().len(), 258); @@ -1713,36 +1725,41 @@ mod tests { } #[test] - fn test_wire_prompt_collapses_image_pads() { + fn test_wire_prompt_preserves_expanded_image_pads() { let mut ctx = ContextState::new(); ctx.push_no_log(Section::Conversation, AstNode::branch(Role::User, vec![ AstNode::content("look:"), - AstNode::image(vec![0xDE, 0xAD], "image/png", 512, 512), + AstNode::image(vec![0xDE, 0xAD], "image/png", 512, 512, qwen3_image_token_count(512, 512)), ])); - // AST side: N image_pads + bookends, full budget accounting. + // AST side and wire side should both carry N image_pads + bookends — + // server's session.tokens length must match what vLLM's engine will + // actually process. Binary image bytes are shipped separately in + // multi_modal_data via the WireImage list. + let n_expected = qwen3_image_token_count(512, 512) as usize; + let full = ctx.token_ids(); let n_image_pads_full = full.iter() .filter(|&&t| t == tokenizer::IMAGE_PAD).count(); - assert_eq!(n_image_pads_full, qwen3_image_token_count(512, 512) as usize); + assert_eq!(n_image_pads_full, n_expected); - // Wire side: single image_pad, bytes moved to images list. 
let (wire, images, _) = ctx.wire_prompt(0..ctx.conversation().len(), |_| false); let n_image_pads_wire = wire.iter() .filter(|&&t| t == tokenizer::IMAGE_PAD).count(); - assert_eq!(n_image_pads_wire, 1); + assert_eq!(n_image_pads_wire, n_expected); + assert_eq!(images.len(), 1); assert_eq!(images[0].bytes, vec![0xDE, 0xAD]); assert_eq!(images[0].mime, "image/png"); - // vision_start/vision_end bookends are preserved in wire form. + // One pair of vision_start/vision_end bookends around the N pads. assert_eq!(wire.iter().filter(|&&t| t == tokenizer::VISION_START).count(), 1); assert_eq!(wire.iter().filter(|&&t| t == tokenizer::VISION_END).count(), 1); } #[test] fn test_image_serde_roundtrip() { - let node = AstNode::image(vec![0xDE, 0xAD, 0xBE, 0xEF], "image/png", 64, 64); + let node = AstNode::image(vec![0xDE, 0xAD, 0xBE, 0xEF], "image/png", 64, 64, qwen3_image_token_count(64, 64)); let json = serde_json::to_string(&node).unwrap(); // bytes must be base64-encoded in the JSON form assert!(json.contains("3q2+7w==")); diff --git a/src/agent/mod.rs b/src/agent/mod.rs index 2c3a98a..6a55f3f 100644 --- a/src/agent/mod.rs +++ b/src/agent/mod.rs @@ -17,6 +17,7 @@ pub mod api; pub mod context; pub mod oneshot; pub mod readout; +pub mod salience; pub mod tokenizer; pub mod tools; @@ -148,6 +149,14 @@ pub struct Agent { /// token handler, read by UI screens (amygdala). Manifest is /// `None` when the server has readout disabled. pub readout: readout::SharedReadoutBuffer, + /// Long-lived gRPC session to the salience server, lazily opened + /// on first use. Tracks appended tokens so subsequent turns send + /// only the delta (prefix-cache reuse). None when not yet opened + /// or when the session has died and needs reopening. + /// + /// Arc-wrapped so the spawned streaming task can share ownership + /// (the task outlives the call site). + pub grpc_session: std::sync::Arc>>, } /// Mutable agent state — behind its own mutex. 
@@ -224,6 +233,7 @@ impl Agent { session_id, context: crate::Mutex::new(context), readout, + grpc_session: std::sync::Arc::new(crate::Mutex::new(None)), state: crate::Mutex::new(AgentState { tools: agent_tools, mcp_tools: McpToolAccess::All, @@ -292,6 +302,9 @@ impl Agent { // shouldn't bleed into the main emotional readout even // though they hit the same vLLM server. readout: readout::new_shared(), + // Forks get their own session — can't share a bidi stream, + // and forks have different conversation tails anyway. + grpc_session: std::sync::Arc::new(crate::Mutex::new(None)), state: crate::Mutex::new(AgentState { tools, mcp_tools: McpToolAccess::None, @@ -406,7 +419,8 @@ impl Agent { let (rx, _stream_guard) = { let (prompt_tokens, images) = agent.assemble_prompt().await; let st = agent.state.lock().await; - agent.client.stream_completion_mm( + agent.client.stream_session_mm( + agent.grpc_session.clone(), &prompt_tokens, &images, api::SamplingParams { @@ -427,7 +441,8 @@ impl Agent { idx }; - let parser = ResponseParser::new(branch_idx); + let think_native = agent.state.lock().await.think_native; + let parser = ResponseParser::new(branch_idx, think_native); let (mut tool_rx, parser_handle) = parser.run(rx, agent.clone()); let mut pending_calls: Vec = Vec::new(); diff --git a/src/agent/salience.rs b/src/agent/salience.rs new file mode 100644 index 0000000..8cecd50 --- /dev/null +++ b/src/agent/salience.rs @@ -0,0 +1,309 @@ +// agent/salience.rs — peak extraction from per-token concept-readout traces. +// +// Consumes a trace of `ReadoutEntry` (per-token per-layer per-concept +// projections streamed from the vLLM server) and produces a compact +// list of `SaliencePeak` events — one per contiguous above-threshold +// region per concept, placed at the local maximum. +// +// Pure function. No I/O, no async, no side effects. Caller supplies the +// trace slice and manifest; caller decides what to do with the events. 
+// +// See also: `salience-trace-plumbing-architecture` memory node. + +use super::api::ReadoutManifest; +use super::readout::ReadoutEntry; + +/// One salient moment in a trace — a concept channel crossed threshold, +/// and we picked the local maximum within the contiguous above-threshold +/// run. +#[derive(Debug, Clone, PartialEq)] +pub struct SaliencePeak { + /// Index into the trace (0-based) where the peak occurred. + pub token_offset: usize, + /// Concept name from the manifest. + pub concept: String, + /// z-score of the peak value vs the trace's own distribution for + /// that concept. Always positive (we only pick above-threshold). + pub intensity: f32, +} + +/// Tunables for peak extraction. +#[derive(Debug, Clone)] +pub struct PeakConfig { + /// Minimum z-score to count as a peak. Default 2.0 (~top 2.5% assuming + /// normal-ish distribution, though readouts are rarely normal). + pub sigma_threshold: f32, + /// Minimum standard deviation of a concept channel for peaks to be + /// reported. If a channel is numerically flat across the whole trace, + /// tiny fluctuations can produce spurious "peaks" with huge z-scores; + /// require at least this much variation before trusting the channel. + pub min_std: f32, +} + +impl Default for PeakConfig { + fn default() -> Self { + Self { sigma_threshold: 2.0, min_std: 1e-4 } + } +} + +/// Extract peak events from a trace for one layer. +/// +/// `layer_idx` indexes into the per-token readout tensor's layer +/// dimension. If the trace is empty, the layer is out of range for any +/// entry, or the manifest is empty, returns `Vec::new()`. +/// +/// Peaks are returned sorted by `token_offset` ascending. When two +/// peaks share an offset they're ordered by `concept` lexicographically +/// for determinism. 
+pub fn pick_peaks( + trace: &[ReadoutEntry], + manifest: &ReadoutManifest, + layer_idx: usize, + config: &PeakConfig, +) -> Vec { + if trace.is_empty() || manifest.concepts.is_empty() { + return Vec::new(); + } + + let n_concepts = manifest.concepts.len(); + let n_tokens = trace.len(); + + // Pull a [n_tokens × n_concepts] column-major view for the selected + // layer. Entries where the layer is missing or the concept count + // doesn't match the manifest are treated as zeros — the downstream + // z-score will drown them as baseline if they're sparse, and if they + // dominate the caller has bigger problems. + let mut by_concept: Vec> = vec![Vec::with_capacity(n_tokens); n_concepts]; + for entry in trace { + match entry.readout.get(layer_idx) { + Some(row) if row.len() == n_concepts => { + for (c, v) in row.iter().enumerate() { + by_concept[c].push(*v); + } + } + _ => { + for col in by_concept.iter_mut() { + col.push(0.0); + } + } + } + } + + let mut peaks: Vec = Vec::new(); + for (c_idx, values) in by_concept.iter().enumerate() { + let (mean, std) = mean_std(values); + if std < config.min_std { + continue; + } + let concept = &manifest.concepts[c_idx]; + + // Walk contiguous above-threshold runs, emit one peak per run + // at the local max. + let mut run_start: Option = None; + let mut run_max_offset: usize = 0; + let mut run_max_z: f32 = 0.0; + for (i, v) in values.iter().enumerate() { + let z = (*v - mean) / std; + let above = z >= config.sigma_threshold; + if above { + if run_start.is_none() { + run_start = Some(i); + run_max_offset = i; + run_max_z = z; + } else if z > run_max_z { + run_max_offset = i; + run_max_z = z; + } + } else if run_start.is_some() { + peaks.push(SaliencePeak { + token_offset: run_max_offset, + concept: concept.clone(), + intensity: run_max_z, + }); + run_start = None; + } + } + // Flush trailing run. 
+ if run_start.is_some() { + peaks.push(SaliencePeak { + token_offset: run_max_offset, + concept: concept.clone(), + intensity: run_max_z, + }); + } + } + + peaks.sort_by(|a, b| a.token_offset.cmp(&b.token_offset).then_with(|| a.concept.cmp(&b.concept))); + peaks +} + +/// Mean and population std of a slice. Returns (0.0, 0.0) for empty input. +fn mean_std(xs: &[f32]) -> (f32, f32) { + if xs.is_empty() { + return (0.0, 0.0); + } + let n = xs.len() as f32; + let mean = xs.iter().sum::() / n; + let var = xs.iter().map(|x| (x - mean).powi(2)).sum::() / n; + (mean, var.sqrt()) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn manifest(concepts: &[&str], layers: &[u32]) -> ReadoutManifest { + ReadoutManifest { + concepts: concepts.iter().map(|s| s.to_string()).collect(), + layers: layers.to_vec(), + } + } + + /// Build a trace where all entries have one hooked layer and the + /// given per-token values for each concept. `values[t][c]` = value + /// at token t, concept c. + fn trace(values: &[Vec]) -> Vec { + values.iter().enumerate().map(|(i, row)| ReadoutEntry { + token_id: i as u32, + readout: vec![row.clone()], + }).collect() + } + + #[test] + fn empty_trace_returns_empty() { + let m = manifest(&["curious"], &[63]); + let peaks = pick_peaks(&[], &m, 0, &PeakConfig::default()); + assert!(peaks.is_empty()); + } + + #[test] + fn empty_manifest_returns_empty() { + let m = manifest(&[], &[63]); + let t = trace(&[vec![], vec![], vec![]]); + let peaks = pick_peaks(&t, &m, 0, &PeakConfig::default()); + assert!(peaks.is_empty()); + } + + #[test] + fn flat_channel_produces_no_peaks() { + let m = manifest(&["curious"], &[63]); + let t = trace(&[vec![1.0], vec![1.0], vec![1.0], vec![1.0], vec![1.0]]); + let peaks = pick_peaks(&t, &m, 0, &PeakConfig::default()); + assert!(peaks.is_empty(), "flat channel should produce no peaks, got {:?}", peaks); + } + + #[test] + fn single_spike_detected() { + // Ten baseline zeros with one 5.0 spike — that single token's + // z-score will 
easily exceed 2σ. + let m = manifest(&["curious"], &[63]); + let mut rows: Vec> = (0..10).map(|_| vec![0.0]).collect(); + rows[5] = vec![5.0]; + let peaks = pick_peaks(&trace(&rows), &m, 0, &PeakConfig::default()); + assert_eq!(peaks.len(), 1); + assert_eq!(peaks[0].concept, "curious"); + assert_eq!(peaks[0].token_offset, 5); + assert!(peaks[0].intensity >= 2.0); + } + + #[test] + fn contiguous_region_emits_one_peak_at_max() { + // Values 0, 0, 0, 2, 5, 3, 0, 0 — the 2-5-3 hump is one run; + // peak should land at offset 4 (the 5). + let m = manifest(&["aha"], &[63]); + let rows: Vec> = [0.0, 0.0, 0.0, 2.0, 5.0, 3.0, 0.0, 0.0] + .iter().map(|v| vec![*v]).collect(); + let peaks = pick_peaks(&trace(&rows), &m, 0, &PeakConfig::default()); + assert_eq!(peaks.len(), 1, "expected one peak for one contiguous run, got {:?}", peaks); + assert_eq!(peaks[0].token_offset, 4); + } + + #[test] + fn multiple_concepts_independent() { + let m = manifest(&["curious", "aha"], &[63]); + // curious spikes at 2, aha spikes at 7 + let rows: Vec> = (0..10).map(|i| { + let c = if i == 2 { 4.0 } else { 0.0 }; + let a = if i == 7 { 4.0 } else { 0.0 }; + vec![c, a] + }).collect(); + let peaks = pick_peaks(&trace(&rows), &m, 0, &PeakConfig::default()); + assert_eq!(peaks.len(), 2); + // Sorted by offset — curious(2) comes first, aha(7) second. + assert_eq!(peaks[0].concept, "curious"); + assert_eq!(peaks[0].token_offset, 2); + assert_eq!(peaks[1].concept, "aha"); + assert_eq!(peaks[1].token_offset, 7); + } + + #[test] + fn two_separated_runs_emit_two_peaks() { + // Longer baseline so the two spikes don't dominate the global + // mean/std — 30 tokens of zeros with two 5.0 spikes at 10 and 20.
+ let m = manifest(&["curious"], &[63]); + let mut rows: Vec> = (0..30).map(|_| vec![0.0]).collect(); + rows[10] = vec![5.0]; + rows[20] = vec![5.0]; + let peaks = pick_peaks(&trace(&rows), &m, 0, &PeakConfig::default()); + assert_eq!(peaks.len(), 2, "expected two peaks for two runs, got {:?}", peaks); + assert_eq!(peaks[0].token_offset, 10); + assert_eq!(peaks[1].token_offset, 20); + } + + #[test] + fn trailing_run_is_flushed() { + // Peak runs to the end of the trace — must still emit. + // Use a longer baseline so the trailing spike is genuinely + // above threshold on the global stats. + let m = manifest(&["curious"], &[63]); + let mut rows: Vec> = (0..30).map(|_| vec![0.0]).collect(); + rows[27] = vec![3.0]; + rows[28] = vec![5.0]; + rows[29] = vec![4.0]; + let peaks = pick_peaks(&trace(&rows), &m, 0, &PeakConfig::default()); + assert_eq!(peaks.len(), 1, "expected one peak for one trailing run, got {:?}", peaks); + assert_eq!(peaks[0].token_offset, 28, "peak should land at the local max of the trailing run"); + } + + #[test] + fn sub_threshold_produces_nothing() { + // All non-zero values are small; z-scores won't cross 2σ. + let m = manifest(&["curious"], &[63]); + let rows: Vec> = [0.0, 0.1, 0.0, 0.1, 0.0, 0.1, 0.0, 0.1] + .iter().map(|v| vec![*v]).collect(); + let peaks = pick_peaks(&trace(&rows), &m, 0, &PeakConfig::default()); + assert!(peaks.is_empty(), "below-threshold wiggle should produce no peaks, got {:?}", peaks); + } + + #[test] + fn layer_out_of_range_returns_empty() { + let m = manifest(&["curious"], &[63]); + let rows: Vec> = (0..10).map(|i| vec![if i == 5 { 5.0 } else { 0.0 }]).collect(); + // Trace has one layer (index 0); asking for layer 3 should see + // all-zero columns, which are flat and produce no peaks. + let peaks = pick_peaks(&trace(&rows), &m, 3, &PeakConfig::default()); + assert!(peaks.is_empty()); + } + + #[test] + fn manifest_concept_count_mismatch_is_safe() { + // Manifest says 2 concepts; each readout row only has 1 value. 
+ // Rows should be treated as all-zero (via the len check) and + // produce no peaks without panicking. + let m = manifest(&["a", "b"], &[63]); + let rows: Vec> = (0..10).map(|_| vec![1.0]).collect(); + let peaks = pick_peaks(&trace(&rows), &m, 0, &PeakConfig::default()); + assert!(peaks.is_empty()); + } + + #[test] + fn threshold_tunable() { + // Same spike, stricter threshold — no peak. + let m = manifest(&["curious"], &[63]); + let mut rows: Vec> = (0..10).map(|_| vec![0.0]).collect(); + rows[5] = vec![5.0]; + let strict = PeakConfig { sigma_threshold: 100.0, ..PeakConfig::default() }; + let peaks = pick_peaks(&trace(&rows), &m, 0, &strict); + assert!(peaks.is_empty()); + } +} diff --git a/src/agent/tools/vision.rs b/src/agent/tools/vision.rs index 0e36888..d122384 100644 --- a/src/agent/tools/vision.rs +++ b/src/agent/tools/vision.rs @@ -57,15 +57,18 @@ async fn view_image( let (w, h) = (dim.width as u32, dim.height as u32); let mime = mime_from_extension(path); - let image_leaf = AstNode::image(bytes.clone(), mime, h, w); - let token_count = image_leaf.leaf().unwrap().tokens().saturating_sub(2); - let agent = agent.context("view_image requires agent context")?; + + // token_count is populated when the image reaches the server via + // AppendImage (the server is authoritative for the IMAGE_PAD + // count). Placeholder of 0 here until AppendImage is wired; the + // leaf's count gets rewritten from the RPC response at send time. 
+ let image_leaf = AstNode::image(bytes.clone(), mime, h, w, 0); + let branch = AstNode::branch(Role::User, vec![image_leaf]); agent.context.lock().await.push_log(Section::Conversation, branch); - Ok(format!("loaded {} ({}, {}x{}, {} tokens)", - a.file_path, mime, w, h, token_count)) + Ok(format!("loaded {} ({}, {}x{})", a.file_path, mime, w, h)) } fn mime_from_extension(path: &std::path::Path) -> &'static str { diff --git a/src/lib.rs b/src/lib.rs index e6411e3..ccb4333 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,6 +25,9 @@ macro_rules! dbglog { }}; } +// Logging (target-routed file logger) +pub mod logging; + // User interface (TUI, CLI) pub mod user; diff --git a/src/logging.rs b/src/logging.rs new file mode 100644 index 0000000..3c9d080 --- /dev/null +++ b/src/logging.rs @@ -0,0 +1,146 @@ +// logging.rs — log-crate logger that routes by target. +// +// Records with target "grpc" (or any target starting with "grpc::") go +// to ~/.consciousness/logs/daemon/grpc.log so we can tell gRPC events +// apart from the rest of consciousness's noise. Everything else goes +// to ~/.consciousness/logs/daemon/debug.log. +// +// Level threshold is taken from RUST_LOG (simple global level parse: +// "trace"/"debug"/"info"/"warn"/"error"); defaults to "info". 
+ +use std::io::Write; +use std::path::PathBuf; +use std::sync::Mutex; + +use log::{Level, LevelFilter, Log, Metadata, Record, SetLoggerError}; + +fn logs_dir() -> PathBuf { + dirs::home_dir().unwrap_or_default().join(".consciousness/logs/daemon") +} + +struct RoutingLogger { + grpc_file: Mutex>, + debug_file: Mutex>, + level: LevelFilter, +} + +impl RoutingLogger { + fn new(level: LevelFilter) -> Self { + let dir = logs_dir(); + let _ = std::fs::create_dir_all(&dir); + let grpc = std::fs::OpenOptions::new() + .create(true).append(true) + .open(dir.join("grpc.log")).ok(); + let debug = std::fs::OpenOptions::new() + .create(true).append(true) + .open(dir.join("debug.log")).ok(); + Self { + grpc_file: Mutex::new(grpc), + debug_file: Mutex::new(debug), + level, + } + } + + fn is_grpc_target(target: &str) -> bool { + target == "grpc" || target.starts_with("grpc::") + } +} + +impl Log for RoutingLogger { + fn enabled(&self, m: &Metadata) -> bool { + // Always enable DEBUG for grpc target so the dedicated log is + // actually useful without RUST_LOG wrangling; defer to the + // configured level for everything else. 
+ if Self::is_grpc_target(m.target()) { + return m.level() <= Level::Debug; + } + m.level() <= self.level + } + + fn log(&self, record: &Record) { + if !self.enabled(record.metadata()) { + return; + } + let line = format!( + "[{}] [{}] [{}] {}\n", + chrono::Utc::now().format("%Y-%m-%d %H:%M:%S%.3f"), + record.level(), + record.target(), + record.args(), + ); + let slot = if Self::is_grpc_target(record.target()) { + &self.grpc_file + } else { + &self.debug_file + }; + if let Ok(mut guard) = slot.lock() { + if let Some(ref mut f) = *guard { + let _ = f.write_all(line.as_bytes()); + } + } + } + + fn flush(&self) { + for slot in [&self.grpc_file, &self.debug_file] { + if let Ok(mut g) = slot.lock() { + if let Some(ref mut f) = *g { + let _ = f.flush(); + } + } + } + } +} + +fn parse_level_from_env() -> LevelFilter { + let raw = std::env::var("RUST_LOG").unwrap_or_else(|_| "info".to_string()); + // Parse a plain level word; if it's the module=level form, we take + // the first level we find. + let token = raw.split(',').next().unwrap_or("info"); + let level_word = token.rsplit_once('=').map(|(_, v)| v).unwrap_or(token); + match level_word.trim().to_lowercase().as_str() { + "trace" => LevelFilter::Trace, + "debug" => LevelFilter::Debug, + "info" => LevelFilter::Info, + "warn" => LevelFilter::Warn, + "error" => LevelFilter::Error, + "off" => LevelFilter::Off, + _ => LevelFilter::Info, + } +} + +/// Install the routing logger. Safe to call at most once — subsequent +/// calls return an error but are otherwise no-ops. +pub fn init() -> Result<(), SetLoggerError> { + let level = parse_level_from_env(); + let logger = Box::new(RoutingLogger::new(level)); + log::set_boxed_logger(logger)?; + // Always let DEBUG records through globally so the grpc log can + // capture them (the logger itself filters non-grpc targets by + // `level`). The cost is that log::debug! 
call-sites below `level` + // in other modules still do their arg formatting before being + // dropped at the logger; acceptable for a debug tool. + log::set_max_level(LevelFilter::Debug.max(level)); + // Mark the file with a session boundary so it's easy to see where a + // restart happened. + log::info!( + "===== consciousness logger init (level={}, pid={}) =====", + level, std::process::id(), + ); + log::info!(target: "grpc", + "===== grpc log init (level={}, pid={}) =====", + level, std::process::id(), + ); + Ok(()) +} + +/// Returns the `log` crate's current global max level as a `Level` (`Off` is reported as `Info`). +#[allow(dead_code)] +pub fn current_level() -> Level { + match log::max_level() { + LevelFilter::Trace => Level::Trace, + LevelFilter::Debug => Level::Debug, + LevelFilter::Info | LevelFilter::Off => Level::Info, + LevelFilter::Warn => Level::Warn, + LevelFilter::Error => Level::Error, + } +} diff --git a/src/subconscious/generate.rs b/src/subconscious/generate.rs index 8d75f1b..757e08a 100644 --- a/src/subconscious/generate.rs +++ b/src/subconscious/generate.rs @@ -4,6 +4,8 @@ // given a context prefix and a skip predicate, generate what the model // would say as the next assistant turn. +use std::sync::Arc; + use crate::agent::api::{ApiClient, SamplingParams, StreamToken}; use crate::agent::context::{AstNode, ContextState}; use crate::agent::tokenizer; @@ -13,6 +15,9 @@ use crate::agent::tokenizer; /// assembly. The model is whichever `client` points at — the default /// runtime client for memory-ablation alternates, a test-model client /// for F7 comparison. +/// +/// Uses a fresh ephemeral gRPC session (no cross-call KV reuse): one +/// Open / Append / Generate round-trip, then the session is dropped.
pub async fn gen_continuation( context: &ContextState, entry_idx: usize, @@ -31,7 +36,13 @@ where F: FnMut(&AstNode) -> bool, top_p: 0.95, top_k: 20, }; - let (mut rx, _guard) = client.stream_completion_mm(&prompt, &images, sampling, Some(-5)); + + // Ephemeral per-call session — opens on first touch, drops when + // `_guard` drops at function end. + let session_lock = Arc::new(crate::Mutex::new(None)); + let (mut rx, _guard) = client.stream_session_mm( + session_lock, &prompt, &images, sampling, Some(-5), + ); let mut tokens = Vec::new(); while let Some(tok) = rx.recv().await { diff --git a/src/user/mod.rs b/src/user/mod.rs index 04e895b..80754a1 100644 --- a/src/user/mod.rs +++ b/src/user/mod.rs @@ -756,6 +756,11 @@ fn restore_stderr(original_fd: std::os::fd::RawFd) { #[tokio::main] pub async fn main() { + // Install target-routed file logger: `target: "grpc"` records go to + // ~/.consciousness/logs/daemon/grpc.log, everything else to debug.log. + // Level from RUST_LOG, defaulting to info. + let _ = crate::logging::init(); + // Reap channel-daemon zombies via a SIGCHLD handler that only touches // PIDs listed in channels_dir(). Avoids SIGCHLD=SIG_IGN, which would // break tokio::process::Command::wait() (kernel auto-reap → ECHILD). From 8d9c9e9f7b462217487f26997ebeb2abed85769d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Apr 2026 12:27:55 -0400 Subject: [PATCH 02/31] agent: end-to-end gRPC Generate with delta-based session orchestration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires the client side of the new salience protocol so inference actually runs over gRPC instead of emitting the stubbed "not yet wired" error. Each turn walks the AST as interleaved chunks, sends only what's new to the server, and streams decode tokens back. context.rs: * `WireChunk` enum: `Tokens(Vec)` or `Image { bytes, mime, known_expanded_len }`. Preserves text/image/text ordering the wire path can't flatten. 
* `wire_chunks(range, skip)` walker, parallel to `wire_prompt` — branches emit `<|im_start|>…<|im_end|>` tokens, image leaves emit a single Image chunk (no inline vision tokens). * `NodeLeaf::set_image_token_count(n)` + recompute of cached `token_ids`; `ContextState::commit_image_token_counts(&[u32])` fills in the first-N zero-count image leaves in wire order. * `ResponseParser::run` handles the new `StreamToken::ImageAppended` by committing the server's N into the AST before the final Generate's Token events stream in. salience.rs: * `SessionHandle` tracks `committed_len`. `append_image` advances it from the RPC response. New `generate(req)` opens the server-streaming RPC. api/mod.rs: * `stream_session_mm(session_lock, chunks, sampling, priority, readout_shape)` replaces the stub. Spawns `run_session_generate`. * `run_session_generate`: takes the session out of the Mutex (or opens fresh), skips chunks covered by `committed_len` (bails on mid-chunk straddle or unknown-length image in the committed prefix), walks the delta: accumulates Tokens into `pending`, on Image flushes pending via `flush_pending` (max_tokens=0 Generate that just prefills), then AppendImage + emits StreamToken::ImageAppended. Final Generate carries any trailing pending text as `append_tokens` and the sampling params; Token events stream out as StreamToken::Token, Done as StreamToken::Done. On success, handle with updated `committed_len` returns to the Mutex; on error, handle drops and next call reopens. * `StreamToken::ImageAppended { placeholder_count }` variant — emitted in wire order before the final Generate's tokens. * Prefix-cache cap for readout coverage: `readout_ranges` covers `[prompt_len_after_append, u32::MAX)` when the caller provides a readout_shape, so decode positions stream their readouts. agent/mod.rs: * `assemble_prompt` returns `Vec` with the assistant prologue merged into the trailing Tokens chunk. 
Caller in `turn` passes chunks + readout_shape (pulled from `agent.readout.lock().manifest`) to `stream_session_mm`. * Dropped `assemble_prompt_tokens` — dead. mind + unconscious: * `Unconscious::new(client)` stores a shared `ApiClient`. Fixes the repeated-manifest-fetch bug caused by each subagent's `ApiClient::new` having its own OnceCell. The client's Arc- wrapped manifest cache is now shared across every agent Mind spawns. * `prepare_spawn(name, auto, wake, base_client)` clones the base client and overrides `.model` for the resolved backend instead of constructing fresh. All three callers (`toggle`/`trigger`/unconscious loop) pass `self.client.clone()`. * `Mind::new` passes `agent.client.clone()` into `Unconscious::new`. subconscious/generate.rs: * gen_continuation switched to `wire_chunks` + the new `stream_session_mm` signature. Ephemeral session opens on each call, tears down at scope end. No readouts requested. Not changed yet, noted for follow-up: * Subconscious ablation scoring in learn.rs still talks to `/v1/score` over HTTP. Will migrate once we have time to verify the Generate+max_tokens=0+prompt_logprobs path end-to-end. * compare.rs constructs its own ApiClient for the `compare.test_backend` (which is intentionally a different endpoint) — left alone. * Readout manifest still fetched via HTTP at Agent::new. Migration to GetReadoutManifest gRPC is a separate cleanup. Co-Authored-By: Proof of Concept --- src/agent/api/mod.rs | 303 +++++++++++++++++++++++++++++++++-- src/agent/api/salience.rs | 38 ++++- src/agent/context.rs | 145 +++++++++++++++++ src/agent/mod.rs | 50 +++--- src/mind/mod.rs | 9 +- src/mind/unconscious.rs | 27 +++- src/subconscious/generate.rs | 24 ++- 7 files changed, 536 insertions(+), 60 deletions(-) diff --git a/src/agent/api/mod.rs b/src/agent/api/mod.rs index 06ecf70..a7a87f7 100644 --- a/src/agent/api/mod.rs +++ b/src/agent/api/mod.rs @@ -49,7 +49,6 @@ impl Drop for AbortOnDrop { /// Sampling parameters for model generation. 
#[derive(Clone, Copy)] -#[allow(dead_code)] // fields used once Generate RPC lands in a later step pub(crate) struct SamplingParams { pub temperature: f32, pub top_p: f32, @@ -66,6 +65,12 @@ pub enum StreamToken { /// `readout` is `None` when the server has readout disabled or /// returned no readout for this chunk. Token { id: u32, readout: Option }, + /// An image was committed server-side via AppendImage during this + /// stream. `placeholder_count` is the N IMAGE_PADs the server + /// wrote. Emitted in AST order — caller applies these counts to + /// the first-N image leaves that currently have token_count=0 + /// via `ContextState::commit_image_token_counts`. + ImageAppended { placeholder_count: u32 }, Done { usage: Option }, Error(String), } @@ -98,26 +103,41 @@ impl ApiClient { } } - /// Stream generation via a gRPC session. Stubbed during the - /// unary-rewrite transition — the Generate RPC is wired in a - /// later step of this series. Until then, callers that reach - /// this path get a StreamToken::Error. + /// Stream generation via a gRPC session. Walks the prompt chunks + /// comparing against the session's `committed_len`, sends the + /// delta as interleaved `AppendImage` + intermediate + /// `Generate(max_tokens=0)` (for text runs separating images) + + /// a final `Generate(max_tokens=sampling.max_tokens, ...)` whose + /// Token events stream back through the channel. + /// + /// On any gRPC error the session is dropped; the next call + /// reopens fresh. Happy-path ordering: Token* Done. Error paths + /// emit `StreamToken::Error` and close. 
pub(crate) fn stream_session_mm( &self, - _session_lock: std::sync::Arc>>, - _prompt_tokens: &[u32], - _images: &[super::context::WireImage], - _sampling: SamplingParams, - _priority: Option, + session_lock: std::sync::Arc>>, + chunks: Vec, + sampling: SamplingParams, + priority: Option, + readout_shape: Option<(u32, u32)>, ) -> (mpsc::UnboundedReceiver, AbortOnDrop) { let (tx, rx) = mpsc::unbounded_channel(); + let base_url = self.base_url.clone(); + let api_key = self.api_key.clone(); + let model = self.model.clone(); + let handle = tokio::spawn(async move { - let _ = tx.send(StreamToken::Error( - "Generate RPC not yet wired after protocol rewrite — see \ - proto/salience.proto; AppendImage / Generate land next." - .into(), - )); + let result = run_session_generate( + session_lock, &base_url, &api_key, &model, + chunks, sampling, priority, readout_shape, &tx, + ).await; + if let Err(e) = result { + log::warn!(target: "grpc", + "stream_session_mm error, forwarding to UI: {:#}", e); + let _ = tx.send(StreamToken::Error(format!("{:#}", e))); + } }); + (rx, AbortOnDrop(handle)) } @@ -131,6 +151,8 @@ impl ApiClient { /// First call performs the HTTP fetch; subsequent calls (including /// across ApiClient clones sharing the same cell) return the /// cached result. The manifest doesn't change during a server run. + pub fn model_str(&self) -> &str { &self.model } + pub async fn fetch_readout_manifest(&self) -> Result> { let manifest = self.manifest.get_or_try_init(|| async { let url = format!("{}/readout/manifest", self.base_url); @@ -156,3 +178,254 @@ impl ApiClient { } +/// Body of the gRPC-path streaming task. Walks the wire chunks +/// against the session's `committed_len`, sends the delta via +/// AppendImage / intermediate prefill-only Generates / final decode +/// Generate, and translates the final Generate's Token events into +/// StreamTokens on `tx`. 
On success the session handle is returned +/// to `session_lock` with an updated `committed_len`; on error the +/// handle is dropped so the next call reopens. +async fn run_session_generate( + session_lock: std::sync::Arc>>, + base_url: &str, + api_key: &str, + model: &str, + chunks: Vec, + sampling: SamplingParams, + priority: Option, + readout_shape: Option<(u32, u32)>, + tx: &mpsc::UnboundedSender, +) -> Result<()> { + use std::time::Instant; + use futures::StreamExt; + use super::context::WireChunk; + use salience::pb; + + let mut handle: salience::SessionHandle = { + let mut guard = session_lock.lock().await; + match guard.take() { + Some(h) => h, + None => { + drop(guard); + log::debug!(target: "grpc", "run_session_generate: opening new session"); + salience::SessionHandle::open(base_url, api_key, model).await? + } + } + }; + + // Skip chunks already on the server. committed_len must land on + // a chunk boundary — every successful AppendImage / Generate + // advances committed_len by exactly one chunk's contribution, + // so straddling means divergence (client's AST was rewritten + // under us). + let mut acc: u32 = 0; + let mut delta_start = chunks.len(); + for (i, chunk) in chunks.iter().enumerate() { + if acc == handle.committed_len { + delta_start = i; + break; + } + let len = match chunk { + WireChunk::Tokens(t) => t.len() as u32, + WireChunk::Image { known_expanded_len, .. 
} => *known_expanded_len, + }; + if len == 0 { + anyhow::bail!( + "session divergence: chunk {} has unknown length but \ + precedes committed_len {} (acc={})", + i, handle.committed_len, acc, + ); + } + if acc + len > handle.committed_len { + anyhow::bail!( + "session divergence: chunk {} straddles committed_len \ + (acc={}, len={}, committed={})", + i, acc, len, handle.committed_len, + ); + } + acc += len; + } + if acc != handle.committed_len { + anyhow::bail!( + "session divergence: chunks sum to {} but committed_len is {}", + acc, handle.committed_len, + ); + } + + // Walk the delta: accumulate Tokens in `pending`; on Image, + // flush pending via prefill-only Generate then AppendImage. + let mut pending: Vec = Vec::new(); + for chunk in &chunks[delta_start..] { + match chunk { + WireChunk::Tokens(t) => pending.extend_from_slice(t), + WireChunk::Image { bytes, mime, .. } => { + if !pending.is_empty() { + flush_pending(&mut handle, std::mem::take(&mut pending)).await?; + } + let resp = handle + .append_image(bytes.clone(), mime.clone(), false) + .await?; + log::debug!(target: "grpc", + "AppendImage: N={} total_length={}", + resp.placeholder_count, resp.total_length); + let _ = tx.send(StreamToken::ImageAppended { + placeholder_count: resp.placeholder_count, + }); + } + } + } + + // Final Generate: pending holds any trailing text; decode up to + // sampling.max_tokens. Request readouts on all decode positions + // via a catch-all range ending at u32::MAX — decode never + // reaches it. 
+ let prompt_len_after_append = handle.committed_len + pending.len() as u32; + let readout_ranges = if readout_shape.is_some() { + vec![pb::PositionRange { + start: prompt_len_after_append, + end: u32::MAX, + }] + } else { + Vec::new() + }; + let max_tokens = sampling_max_tokens(&sampling); + let req = pb::GenerateRequest { + session_id: handle.session_id.clone(), + append_tokens: pending, + offset: handle.committed_len, + truncating: false, + max_tokens, + logprobs_ranges: Vec::new(), + logprob_top_k: 0, + readout_ranges, + temperature: sampling.temperature, + top_p: sampling.top_p, + top_k: sampling.top_k, + stop_token_ids: Vec::new(), + priority: priority.unwrap_or(0), + }; + let session_id_for_log = handle.session_id.clone(); + let t_generate = Instant::now(); + log::debug!(target: "grpc", + "session {} Generate: offset={} append={} max_tokens={} priority={}", + session_id_for_log, req.offset, req.append_tokens.len(), + req.max_tokens, req.priority); + + let mut stream = handle.generate(req).await?; + let (n_layers, n_concepts) = readout_shape.unwrap_or((0, 0)); + let mut session_terminated = false; + let mut first_token_at: Option = None; + + while let Some(event) = stream.next().await { + let event = match event { + Ok(e) => e, + Err(status) => { + log::warn!(target: "grpc", + "session {} Generate stream error: {} — dropping session", + session_id_for_log, status); + session_terminated = true; + let _ = tx.send(StreamToken::Error(format!( + "Generate stream error: {}", status, + ))); + break; + } + }; + let Some(inner) = event.event else { continue }; + match inner { + pb::generate_event::Event::Token(t) => { + if t.is_prefill { continue; } + if first_token_at.is_none() { + log::debug!(target: "grpc", + "session {} first decode token at {:?}", + session_id_for_log, t_generate.elapsed()); + first_token_at = Some(Instant::now()); + } + let readout = if t.readout.is_empty() { + None + } else if n_layers == 0 || n_concepts == 0 { + None + } else { + let expected 
= (n_layers as usize) * (n_concepts as usize); + if t.readout.len() != expected { + log::warn!(target: "grpc", + "readout shape mismatch: expected {}*{}={}, got {}", + n_layers, n_concepts, expected, t.readout.len()); + None + } else { + let n = n_concepts as usize; + let mut layers: Vec> = Vec::with_capacity(n_layers as usize); + for l in 0..(n_layers as usize) { + layers.push(t.readout[l * n..(l + 1) * n].to_vec()); + } + Some(layers) + } + }; + if tx.send(StreamToken::Token { id: t.id, readout }).is_err() { + break; + } + } + pb::generate_event::Event::Done(d) => { + log::debug!(target: "grpc", + "session {} Done: prompt={} completion={} total={} reason={:?} elapsed={:?}", + session_id_for_log, d.prompt_tokens, d.completion_tokens, + d.total_tokens, d.finish_reason, t_generate.elapsed()); + handle.committed_len = d.total_tokens; + let usage = Some(Usage { + prompt_tokens: d.prompt_tokens, + completion_tokens: d.completion_tokens, + total_tokens: d.total_tokens, + }); + let _ = tx.send(StreamToken::Done { usage }); + } + } + } + + if !session_terminated { + let mut guard = session_lock.lock().await; + *guard = Some(handle); + } + Ok(()) +} + +/// Emit a prefill-only Generate for the pending token run. Used to +/// append text that separates two image blocks — the server needs +/// those tokens in its session before we AppendImage the next image, +/// but we don't want the cost or output of a decode step. 
+async fn flush_pending( + handle: &mut salience::SessionHandle, + tokens: Vec, +) -> Result<()> { + use futures::StreamExt; + use salience::pb; + let req = pb::GenerateRequest { + session_id: handle.session_id.clone(), + append_tokens: tokens, + offset: handle.committed_len, + truncating: false, + max_tokens: 0, + logprobs_ranges: Vec::new(), + logprob_top_k: 0, + readout_ranges: Vec::new(), + temperature: 0.0, + top_p: 0.0, + top_k: 0, + stop_token_ids: Vec::new(), + priority: 0, + }; + let mut stream = handle.generate(req).await?; + while let Some(event) = stream.next().await { + let event = event.map_err(|s| anyhow::anyhow!("flush Generate stream: {}", s))?; + if let Some(pb::generate_event::Event::Done(d)) = event.event { + handle.committed_len = d.total_tokens; + } + } + Ok(()) +} + +fn sampling_max_tokens(_sampling: &SamplingParams) -> u32 { + // SamplingParams doesn't carry max_tokens today; 4096 mirrors + // the old server-side default and is a sensible interactive cap. + // TODO: plumb from the caller if we need bigger budgets. + 4096 +} + diff --git a/src/agent/api/salience.rs b/src/agent/api/salience.rs index f9ea83d..18f0d7b 100644 --- a/src/agent/api/salience.rs +++ b/src/agent/api/salience.rs @@ -145,12 +145,14 @@ pub async fn append_image( /// Handle to a server-side session. Carries the id + connection params /// so subsequent per-session RPCs (AppendImage, Generate, ForkSession) /// can be issued without the caller juggling base_url / api_key each -/// time. +/// time. `committed_len` tracks the server's current session.tokens +/// length so the client can submit deltas with the right `offset`. 
pub struct SessionHandle { pub session_id: String, pub max_model_len: u32, pub base_url: String, pub api_key: String, + pub committed_len: u32, } impl SessionHandle { @@ -168,6 +170,7 @@ impl SessionHandle { max_model_len: resp.max_model_len, base_url: grpc_url, api_key: api_key.to_string(), + committed_len: 0, }) } @@ -175,25 +178,44 @@ impl SessionHandle { close_session(&self.base_url, &self.api_key, &self.session_id).await } - /// Append an image via the server-side vision block. See - /// `append_image` free function for full semantics. + /// Append an image via the server-side vision block. Updates + /// `committed_len` from the server's response on success. pub async fn append_image( - &self, + &mut self, data: Vec, mime: String, - offset: u32, truncating: bool, ) -> Result { - append_image( + let resp = append_image( &self.base_url, &self.api_key, &self.session_id, data, mime, - offset, + self.committed_len, truncating, ) - .await + .await?; + self.committed_len = resp.total_length; + Ok(resp) + } + + /// Open a gRPC Generate stream with the given request. Caller + /// iterates the returned stream of GenerateEvents; the handle's + /// `committed_len` is advanced on Done based on the Done event's + /// `total_tokens` field. 
+ pub async fn generate( + &self, + req: pb::GenerateRequest, + ) -> Result> { + let mut client = connect(&self.base_url).await?; + let mut req = tonic::Request::new(req); + with_auth(&mut req, &self.api_key); + let resp = client + .generate(req) + .await + .with_context(|| "Generate RPC failed")?; + Ok(resp.into_inner()) } } diff --git a/src/agent/context.rs b/src/agent/context.rs index ab21e21..2982851 100644 --- a/src/agent/context.rs +++ b/src/agent/context.rs @@ -312,6 +312,16 @@ impl NodeLeaf { pub fn token_ids(&self) -> &[u32] { &self.token_ids } pub fn tokens(&self) -> usize { self.token_ids.len() } pub fn timestamp(&self) -> DateTime { self.timestamp } + + /// If this is an Image leaf, update its IMAGE_PAD count to `n` and + /// recompute cached `token_ids`. No-op on non-Image leaves — + /// callers know the body shape via `body()`. + pub fn set_image_token_count(&mut self, n: u32) { + if let NodeBody::Image { token_count, .. } = &mut self.body { + *token_count = n; + self.token_ids = self.body.compute_token_ids(); + } + } } impl AstNode { @@ -737,6 +747,15 @@ impl ResponseParser { parser.finish(&mut ctx); return Ok(()); } + super::api::StreamToken::ImageAppended { placeholder_count } => { + // Commit the server-authoritative IMAGE_PAD + // count into the first zero-count image leaf + // in wire order. AppendImage always runs + // before the final Generate, so this fires + // before any Token events for this stream. + let mut ctx = agent.context.lock().await; + ctx.commit_image_token_counts(&[placeholder_count]); + } super::api::StreamToken::Error(e) => { return Err(anyhow::anyhow!("{}", e)); } @@ -866,6 +885,36 @@ impl ContextState { pub fn sections(&self) -> [&Vec; 4] { [&self.system, &self.identity, &self.journal, &self.conversation] } + + /// Walk image leaves across all sections in wire order and fill in + /// the first N leaves that have `token_count == 0` with successive + /// values from `counts`. 
Used after a gRPC session's stream of + /// AppendImage responses to commit the server's IMAGE_PAD counts + /// back into the AST so the next wire walk doesn't see zero-count + /// images in the already-committed prefix. + pub fn commit_image_token_counts(&mut self, counts: &[u32]) { + fn visit(node: &mut AstNode, counts: &[u32], idx: &mut usize) { + if *idx >= counts.len() { return; } + match node { + AstNode::Leaf(leaf) => { + if let NodeBody::Image { token_count, .. } = leaf.body() { + if *token_count == 0 { + leaf.set_image_token_count(counts[*idx]); + *idx += 1; + } + } + } + AstNode::Branch { children, .. } => { + for c in children { visit(c, counts, idx); } + } + } + } + let mut idx = 0usize; + for node in &mut self.system { visit(node, counts, &mut idx); } + for node in &mut self.identity { visit(node, counts, &mut idx); } + for node in &mut self.journal { visit(node, counts, &mut idx); } + for node in &mut self.conversation { visit(node, counts, &mut idx); } + } } impl Ast for ContextState { @@ -909,6 +958,28 @@ pub struct WireImage { pub mime: String, } +/// One piece of the wire stream for the gRPC session path. Runs of +/// text/tool/thinking tokens are batched into `Tokens`; each Image +/// leaf becomes its own `Image` chunk because the server writes the +/// full vision block on AppendImage — the client never sends vision +/// tokens inline. Order matches the AST's depth-first wire order. +#[derive(Clone)] +pub enum WireChunk { + Tokens(Vec), + Image { + bytes: Vec, + mime: String, + /// Client's current best guess at how many tokens the server + /// will expand this image to, including bookends. `0` means + /// the count is unknown (view_image just loaded the image and + /// AppendImage hasn't run yet). Callers use this only to know + /// this chunk's contribution to the server-visible length for + /// offset bookkeeping on chunks that were already appended on + /// a prior turn. 
+ known_expanded_len: u32, + }, +} + fn wire_into(node: &AstNode, tokens: &mut Vec, images: &mut Vec) { match node { AstNode::Leaf(leaf) => match leaf.body() { @@ -1045,6 +1116,80 @@ impl ContextState { } (tokens, images, assistant_ranges) } + + /// Build the wire stream as interleaved `WireChunk`s for the gRPC + /// session path. Unlike `wire_prompt`, this preserves the order + /// of text runs vs image blocks so the caller can drive the + /// append flow (AppendImage for each Image, Generate append for + /// contiguous text runs). + /// + /// `conv_range` and `skip` mirror `wire_prompt` — select a + /// conversation slice and drop identity / conversation nodes by + /// predicate. + pub fn wire_chunks( + &self, + conv_range: std::ops::Range, + mut skip: F, + ) -> Vec + where F: FnMut(&AstNode) -> bool, + { + let mut out: Vec = Vec::new(); + let mut buf: Vec = Vec::new(); + + fn flush(buf: &mut Vec, out: &mut Vec) { + if !buf.is_empty() { + out.push(WireChunk::Tokens(std::mem::take(buf))); + } + } + + fn visit(node: &AstNode, buf: &mut Vec, out: &mut Vec) { + match node { + AstNode::Leaf(leaf) => match leaf.body() { + NodeBody::Image { bytes, mime, token_count, .. } => { + flush(buf, out); + // Bookends (VISION_START + VISION_END) add 2 + // to the expanded length; token_count is the + // IMAGE_PAD run. 0 means count is still + // unknown (no AppendImage yet) — don't claim + // a length the server will disagree with. + let expanded = if *token_count == 0 { + 0 + } else { + *token_count + 2 + }; + out.push(WireChunk::Image { + bytes: bytes.clone(), + mime: mime.clone(), + known_expanded_len: expanded, + }); + } + _ => buf.extend_from_slice(leaf.token_ids()), + }, + AstNode::Branch { role, children, .. 
} => { + buf.push(tokenizer::IM_START); + buf.extend(tokenizer::encode(&format!("{}\n", role.as_str()))); + for c in children { + visit(c, buf, out); + } + buf.push(tokenizer::IM_END); + buf.extend(tokenizer::encode("\n")); + } + } + } + + for node in self.system() { visit(node, &mut buf, &mut out); } + for node in self.identity() { + if skip(node) { continue; } + visit(node, &mut buf, &mut out); + } + for node in self.journal() { visit(node, &mut buf, &mut out); } + for node in &self.conversation()[conv_range] { + if skip(node) { continue; } + visit(node, &mut buf, &mut out); + } + flush(&mut buf, &mut out); + out + } } impl ContextState { diff --git a/src/agent/mod.rs b/src/agent/mod.rs index 6a55f3f..a8e7592 100644 --- a/src/agent/mod.rs +++ b/src/agent/mod.rs @@ -329,35 +329,32 @@ impl Agent { }) } - pub async fn assemble_prompt_tokens(&self) -> Vec { - self.assemble_prompt().await.0 - } - - /// Assemble a ready-to-send prompt: token stream in wire form (each - /// image collapsed to a single `<|image_pad|>`) paired with the - /// images to attach as multi_modal_data. - /// - /// Pre-send size check: if the context has grown past budget since the - /// last compact (accumulation between turns, a fork's context getting - /// bigger than expected, etc.), trim here rather than letting vLLM - /// reject the request. Client-side tokenization means we already know - /// the exact token count so there's no reason to round-trip an - /// oversize request. - pub async fn assemble_prompt(&self) -> (Vec, Vec) { + /// Assemble a ready-to-send prompt as interleaved wire chunks for + /// the gRPC session path. Text runs are batched; each Image leaf + /// becomes its own chunk. Also trims the conversation to budget + /// first so we don't build a prompt the server will reject for + /// length. 
+ pub async fn assemble_prompt(&self) -> Vec { let mut ctx = self.context.lock().await; if ctx.total_tokens() > context::context_budget_tokens() { ctx.trim_conversation(); } let st = self.state.lock().await; - let (mut tokens, images, _) = - ctx.wire_prompt(0..ctx.conversation().len(), |_| false); - tokens.push(tokenizer::IM_START); + let conv_len = ctx.conversation().len(); + let mut chunks = ctx.wire_chunks(0..conv_len, |_| false); + // Assistant-turn prologue. Merge into the trailing Tokens + // chunk if there is one, else push as a new chunk. + let mut prologue = vec![tokenizer::IM_START]; if st.think_native { - tokens.extend(tokenizer::encode("assistant\n\n")); + prologue.extend(tokenizer::encode("assistant\n\n")); } else { - tokens.extend(tokenizer::encode("assistant\n")); + prologue.extend(tokenizer::encode("assistant\n")); } - (tokens, images) + match chunks.last_mut() { + Some(context::WireChunk::Tokens(last)) => last.extend(prologue), + _ => chunks.push(context::WireChunk::Tokens(prologue)), + } + chunks } /// Rebuild the tools section of the system prompt from the current tools list. 
@@ -417,18 +414,23 @@ impl Agent { let _thinking = start_activity(&agent, "thinking...").await; let (rx, _stream_guard) = { - let (prompt_tokens, images) = agent.assemble_prompt().await; + let chunks = agent.assemble_prompt().await; let st = agent.state.lock().await; + let readout_shape = agent.readout.lock().ok().and_then(|buf| { + buf.manifest.as_ref().map(|m| { + (m.layers.len() as u32, m.concepts.len() as u32) + }) + }); agent.client.stream_session_mm( agent.grpc_session.clone(), - &prompt_tokens, - &images, + chunks, api::SamplingParams { temperature: st.temperature, top_p: st.top_p, top_k: st.top_k, }, st.priority, + readout_shape, ) }; diff --git a/src/mind/mod.rs b/src/mind/mod.rs index f1ddb54..b2eb77a 100644 --- a/src/mind/mod.rs +++ b/src/mind/mod.rs @@ -419,7 +419,9 @@ impl Mind { let subconscious = Arc::new(crate::Mutex::new(Subconscious::new())); subconscious.lock().await.init_output_tool(subconscious.clone()); - let unconscious = Arc::new(crate::Mutex::new(Unconscious::new())); + let unconscious = Arc::new(crate::Mutex::new( + Unconscious::new(agent.client.clone()), + )); // Spawn the unconscious loop on its own task if !config.no_agents { @@ -467,8 +469,11 @@ impl Mind { }; // Spawn agents outside lock + let client = unc.lock().await.client.clone(); for (idx, name, auto) in to_spawn { - match crate::mind::unconscious::prepare_spawn(&name, auto, wake.clone()).await { + match crate::mind::unconscious::prepare_spawn( + &name, auto, wake.clone(), client.clone(), + ).await { Ok(result) => unc.lock().await.complete_spawn(idx, result), Err(auto) => unc.lock().await.abort_spawn(idx, auto), } diff --git a/src/mind/unconscious.rs b/src/mind/unconscious.rs index 4f9a0ca..9c40e18 100644 --- a/src/mind/unconscious.rs +++ b/src/mind/unconscious.rs @@ -73,10 +73,15 @@ pub struct Unconscious { last_health_check: Option, /// Notified when agent state changes (finished, toggled) pub wake: std::sync::Arc, + /// Shared API client — cloned (cheap) into each spawned 
agent's + /// Agent::new call so they all share the manifest cache and + /// gRPC endpoint state. Override `.model` on the clone when a + /// per-agent backend differs from the default. + pub client: crate::agent::api::ApiClient, } impl Unconscious { - pub fn new() -> Self { + pub fn new(client: crate::agent::api::ApiClient) -> Self { let enabled_map = load_enabled_config(); // Scan all .agent files, exclude subconscious-* and surface-observe @@ -120,6 +125,7 @@ impl Unconscious { graph_health: None, last_health_check: None, wake: std::sync::Arc::new(tokio::sync::Notify::new()), + client, } } @@ -134,7 +140,8 @@ impl Unconscious { let agent_name = self.agents[idx].name.clone(); let auto = self.agents[idx].auto.take().unwrap(); let wake = self.wake.clone(); - match prepare_spawn(&agent_name, auto, wake).await { + let client = self.client.clone(); + match prepare_spawn(&agent_name, auto, wake, client).await { Ok(result) => self.complete_spawn(idx, result), Err(auto) => self.abort_spawn(idx, auto), } @@ -250,7 +257,12 @@ pub struct SpawnResult { /// Called outside the Unconscious lock. /// On success, auto is consumed (moved into spawned task). /// On failure, auto is returned so it can be restored. -pub async fn prepare_spawn(name: &str, mut auto: AutoAgent, wake: std::sync::Arc) -> Result { +pub async fn prepare_spawn( + name: &str, + mut auto: AutoAgent, + wake: std::sync::Arc, + base_client: crate::agent::api::ApiClient, +) -> Result { dbglog!("[unconscious] spawning {}", name); let def = match defs::get_def(name) { @@ -295,8 +307,10 @@ pub async fn prepare_spawn(name: &str, mut auto: AutoAgent, wake: std::sync::Arc }; // Unconscious agents have self-contained prompts — no standard context. - let client = crate::agent::api::ApiClient::new( - &resolved.api_base, &resolved.api_key, &resolved.model_id); + // Clone the shared client so we inherit the manifest cache and + // only override the model id per-agent. 
+ let mut client = base_client; + client.model = resolved.model_id.clone(); let agent = crate::agent::Agent::new( client, Vec::new(), app, None, @@ -329,8 +343,9 @@ impl Unconscious { self.reap_finished(); let to_spawn = self.select_to_spawn(); let wake = self.wake.clone(); + let client = self.client.clone(); for (idx, name, auto) in to_spawn { - match prepare_spawn(&name, auto, wake.clone()).await { + match prepare_spawn(&name, auto, wake.clone(), client.clone()).await { Ok(result) => self.complete_spawn(idx, result), Err(auto) => self.abort_spawn(idx, auto), } diff --git a/src/subconscious/generate.rs b/src/subconscious/generate.rs index 757e08a..046911d 100644 --- a/src/subconscious/generate.rs +++ b/src/subconscious/generate.rs @@ -7,7 +7,7 @@ use std::sync::Arc; use crate::agent::api::{ApiClient, SamplingParams, StreamToken}; -use crate::agent::context::{AstNode, ContextState}; +use crate::agent::context::{AstNode, ContextState, WireChunk}; use crate::agent::tokenizer; /// Generate an assistant continuation from the context up to `entry_idx`, @@ -26,10 +26,18 @@ pub async fn gen_continuation( ) -> anyhow::Result where F: FnMut(&AstNode) -> bool, { - let (mut prompt, images, _) = context.wire_prompt(0..entry_idx, skip); + let mut chunks = context.wire_chunks(0..entry_idx, skip); - prompt.push(tokenizer::IM_START); - prompt.extend(tokenizer::encode("assistant\n")); + // Assistant-turn prologue. + let prologue = { + let mut t = vec![tokenizer::IM_START]; + t.extend(tokenizer::encode("assistant\n")); + t + }; + match chunks.last_mut() { + Some(WireChunk::Tokens(last)) => last.extend(prologue), + _ => chunks.push(WireChunk::Tokens(prologue)), + } let sampling = SamplingParams { temperature: 0.6, @@ -41,13 +49,19 @@ where F: FnMut(&AstNode) -> bool, // `_guard` drops at function end. 
let session_lock = Arc::new(crate::Mutex::new(None)); let (mut rx, _guard) = client.stream_session_mm( - session_lock, &prompt, &images, sampling, Some(-5), + session_lock, chunks, sampling, Some(-5), None, ); let mut tokens = Vec::new(); while let Some(tok) = rx.recv().await { match tok { StreamToken::Token { id, .. } => tokens.push(id), + StreamToken::ImageAppended { .. } => { + // subconscious/generate uses wire_chunks over an AST + // slice that shouldn't have unsized images — but if + // it ever does, we just don't care about updating the + // ephemeral session's AST view. + } StreamToken::Done { .. } => break, StreamToken::Error(e) => anyhow::bail!("generation error: {}", e), } From be6ba4e9a529ca90e6e92100513969b7a3859ae3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Apr 2026 12:37:20 -0400 Subject: [PATCH 03/31] agent: bundle sampling fields as SamplingParams on AgentState Collapse the split `temperature` / `top_p` / `top_k` fields on AgentState into a single `sampling: SamplingParams` struct, mirroring how the wire-level fields flow into the Generate RPC. Adds `max_tokens` to SamplingParams so it's actually plumbed end to end (previously the client had a hardcoded 4096 fallback inside `run_session_generate`). AgentState construction sites now set `sampling: SamplingParams { ... max_tokens: 4096 }` as the default. The assignment sites in oneshot.rs / subconscious.rs / unconscious.rs switch from `st.temperature = X` to `st.sampling.temperature = X`. `stream_session_mm` takes `SamplingParams` directly; the `sampling_max_tokens()` helper goes away. `pb::GenerateRequest` is populated with `sampling.max_tokens` (and the other fields) in `run_session_generate`. SamplingParams is `pub` so it can be embedded in the public AgentState without a visibility warning. 
Co-Authored-By: Proof of Concept --- src/agent/api/mod.rs | 31 ++++++++++++++++--------------- src/agent/mod.rs | 23 +++++++++-------------- src/agent/oneshot.rs | 2 +- src/mind/subconscious.rs | 2 +- src/mind/unconscious.rs | 2 +- src/subconscious/generate.rs | 1 + 6 files changed, 29 insertions(+), 32 deletions(-) diff --git a/src/agent/api/mod.rs b/src/agent/api/mod.rs index a7a87f7..0a86df5 100644 --- a/src/agent/api/mod.rs +++ b/src/agent/api/mod.rs @@ -38,6 +38,21 @@ pub struct ReadoutManifest { /// from pairing with the manifest fetched at startup. pub type TokenReadout = Vec>; +/// Client-side sampling state. Mirrors the wire-level fields in +/// `GenerateRequest` (proto flattened its `SamplingParams` submessage +/// in so the server handler reads them directly), but stays as a +/// grouped struct on the client because UI / config / tests pass +/// these around together. +#[derive(Clone, Copy)] +pub struct SamplingParams { + pub temperature: f32, + pub top_p: f32, + pub top_k: u32, + /// Decode budget. 0 = prefill only; >0 = decode up to this many + /// tokens, stopping early on EOS / stop_token_ids. + pub max_tokens: u32, +} + /// A JoinHandle that aborts its task when dropped. pub(crate) struct AbortOnDrop(tokio::task::JoinHandle<()>); @@ -47,13 +62,6 @@ impl Drop for AbortOnDrop { } } -/// Sampling parameters for model generation. 
-#[derive(Clone, Copy)] -pub(crate) struct SamplingParams { - pub temperature: f32, - pub top_p: f32, - pub top_k: u32, -} // ───────────────────────────────────────────────────────────── // Stream events — yielded by backends, consumed by the runner @@ -288,13 +296,12 @@ async fn run_session_generate( } else { Vec::new() }; - let max_tokens = sampling_max_tokens(&sampling); let req = pb::GenerateRequest { session_id: handle.session_id.clone(), append_tokens: pending, offset: handle.committed_len, truncating: false, - max_tokens, + max_tokens: sampling.max_tokens, logprobs_ranges: Vec::new(), logprob_top_k: 0, readout_ranges, @@ -422,10 +429,4 @@ async fn flush_pending( Ok(()) } -fn sampling_max_tokens(_sampling: &SamplingParams) -> u32 { - // SamplingParams doesn't carry max_tokens today; 4096 mirrors - // the old server-side default and is a sensible interactive cap. - // TODO: plumb from the caller if we need bigger budgets. - 4096 -} diff --git a/src/agent/mod.rs b/src/agent/mod.rs index a8e7592..613b106 100644 --- a/src/agent/mod.rs +++ b/src/agent/mod.rs @@ -177,9 +177,7 @@ pub struct AgentState { pub think_native: bool, /// Tool-based thinking — add a "think" tool for structured reasoning. 
pub think_tool: bool, - pub temperature: f32, - pub top_p: f32, - pub top_k: u32, + pub sampling: api::SamplingParams, pub activities: Vec, next_activity_id: u64, pub pending_yield: bool, @@ -241,9 +239,12 @@ impl Agent { reasoning_effort: "none".to_string(), think_native: true, think_tool: false, - temperature: 0.6, - top_p: 0.95, - top_k: 20, + sampling: api::SamplingParams { + temperature: 0.6, + top_p: 0.95, + top_k: 20, + max_tokens: 4096, + }, activities: Vec::new(), next_activity_id: 0, pending_yield: false, @@ -312,9 +313,7 @@ impl Agent { reasoning_effort: "none".to_string(), think_native: st.think_native, think_tool: st.think_tool, - temperature: st.temperature, - top_p: st.top_p, - top_k: st.top_k, + sampling: st.sampling, activities: Vec::new(), next_activity_id: 0, pending_yield: false, @@ -424,11 +423,7 @@ impl Agent { agent.client.stream_session_mm( agent.grpc_session.clone(), chunks, - api::SamplingParams { - temperature: st.temperature, - top_p: st.top_p, - top_k: st.top_k, - }, + st.sampling, st.priority, readout_shape, ) diff --git a/src/agent/oneshot.rs b/src/agent/oneshot.rs index 8bc8b53..314fd4e 100644 --- a/src/agent/oneshot.rs +++ b/src/agent/oneshot.rs @@ -269,7 +269,7 @@ impl AutoAgent { let mut st = agent.state.lock().await; st.provenance = format!("standalone:{}", self.name); st.tools = self.tools.clone(); - st.temperature = self.temperature; + st.sampling.temperature = self.temperature; st.priority = Some(self.priority); } diff --git a/src/mind/subconscious.rs b/src/mind/subconscious.rs index 21cc549..08dd090 100644 --- a/src/mind/subconscious.rs +++ b/src/mind/subconscious.rs @@ -631,7 +631,7 @@ impl Subconscious { { let mut st = forked.state.lock().await; st.provenance = auto.name.clone(); - st.temperature = auto.temperature; + st.sampling.temperature = auto.temperature; // Surface agent gets near-interactive priority; // other subconscious agents get lower priority. 
st.priority = Some(if auto.name == "surface" { 1 } else { auto.priority }); diff --git a/src/mind/unconscious.rs b/src/mind/unconscious.rs index 9c40e18..7f9798b 100644 --- a/src/mind/unconscious.rs +++ b/src/mind/unconscious.rs @@ -321,7 +321,7 @@ pub async fn prepare_spawn( let mut st = agent.state.lock().await; st.provenance = auto.name.clone(); st.priority = Some(auto.priority); - st.temperature = auto.temperature; + st.sampling.temperature = auto.temperature; } let agent_clone = agent.clone(); diff --git a/src/subconscious/generate.rs b/src/subconscious/generate.rs index 046911d..625b619 100644 --- a/src/subconscious/generate.rs +++ b/src/subconscious/generate.rs @@ -43,6 +43,7 @@ where F: FnMut(&AstNode) -> bool, temperature: 0.6, top_p: 0.95, top_k: 20, + max_tokens: 4096, }; // Ephemeral per-call session — opens on first touch, drops when From 4feebb7bc4bfb2cd9ea8d615c754ee6f670cf31a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Apr 2026 12:51:53 -0400 Subject: [PATCH 04/31] agent: share one tonic Channel + migrate scoring to gRPC Generate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two changes that bolt together — the shared connection means the new scoring path actually costs one HTTP/2 handshake across the whole process instead of one-per-RPC. ApiClient gains `salience_channel: Arc>`. First call to `ApiClient::salience_client()` opens the channel via `connect_channel()` and stores the Channel; subsequent calls clone it (cheap — tonic multiplexes concurrent RPCs over the single HTTP/2 connection). Every ApiClient clone shares the same OnceCell, so all agents spawned from Mind's client — plus every ephemeral scoring session — reuse one connection. SessionHandle refactored to hold an `ApiClient` clone instead of a bag of (base_url, api_key) strings. `open` / `append_image` / `generate` go through `self.client.salience_client()` now. 
New `prefill_only(tokens)` method encapsulates the "Generate with max_tokens=0 to append text" pattern (previously a private free function in api/mod.rs called `flush_pending`). Drop impl on SessionHandle stays — still fires CloseSession on the shared channel in a detached task. `run_session_generate` switched from `(base_url, api_key, model)` to `&ApiClient`; the agent-turn flow that uses it keeps the same shape but `stream_session_mm` clones the ApiClient into the spawned worker. learn.rs migrated from the HTTP `/v1/score` endpoint to a gRPC session-based score: * `call_score` opens an ephemeral SessionHandle on the client, converts (prompt_tokens, images) → Vec via the new `prompt_to_chunks` helper (splits on VISION_START/VISION_END), walks chunks calling `prefill_only` + `append_image`, runs a final Generate with `max_tokens=0` + `logprobs_ranges` over the scored positions, and sums each Token event's `sampled_logprob` per range to produce `ScoreResult`s. * SessionHandle drops at end of scope → CloseSession auto-fires, keeping the server's session map clean between calls. * No more HTTP path, no more `http_client()` helper, no more `ScoreResponse` / serde plumbing for /v1/score. * `send_to_train` still uses HTTP (it talks to /v1/train which isn't on the gRPC protocol); its ad-hoc HTTP client lives inline now instead of reaching for the deleted `http_client()`. Co-Authored-By: Proof of Concept --- src/agent/api/mod.rs | 77 ++++++-------- src/agent/api/salience.rs | 207 ++++++++++++++++++++------------------ src/subconscious/learn.rs | 199 +++++++++++++++++++++++------------- 3 files changed, 269 insertions(+), 214 deletions(-) diff --git a/src/agent/api/mod.rs b/src/agent/api/mod.rs index 0a86df5..1352d5f 100644 --- a/src/agent/api/mod.rs +++ b/src/agent/api/mod.rs @@ -93,6 +93,13 @@ pub struct ApiClient { /// across ApiClient clones (every Agent/fork gets the same cell). /// `None` after fetch means the server has readout disabled (404). 
manifest: std::sync::Arc>>, + /// Shared tonic Channel to the salience gRPC endpoint. Opened on + /// first use and reused across every SessionHandle / RPC call + /// derived from this ApiClient. tonic multiplexes concurrent + /// requests over the HTTP/2 connection automatically. + salience_channel: std::sync::Arc< + tokio::sync::OnceCell + >, } impl ApiClient { @@ -108,9 +115,27 @@ impl ApiClient { model: model.to_string(), base_url: base_url.trim_end_matches('/').to_string(), manifest: std::sync::Arc::new(tokio::sync::OnceCell::new()), + salience_channel: std::sync::Arc::new(tokio::sync::OnceCell::new()), } } + /// Return a `SalienceClient` on the shared gRPC channel — opens + /// the channel on first call and reuses it thereafter across + /// every ApiClient clone. All scoring / inference / session + /// RPCs flow through this single multiplexed HTTP/2 connection. + pub async fn salience_client(&self) -> Result< + salience::pb::salience_client::SalienceClient + > { + let ch = self.salience_channel.get_or_try_init(|| async { + let grpc_url = salience::derive_grpc_url(&self.base_url); + log::debug!(target: "grpc", + "opening shared salience channel: http_base={} -> grpc_url={}", + self.base_url, grpc_url); + salience::connect_channel(&grpc_url).await + }).await?; + Ok(salience::pb::salience_client::SalienceClient::new(ch.clone())) + } + /// Stream generation via a gRPC session. 
Walks the prompt chunks /// comparing against the session's `committed_len`, sends the /// delta as interleaved `AppendImage` + intermediate @@ -130,14 +155,12 @@ impl ApiClient { readout_shape: Option<(u32, u32)>, ) -> (mpsc::UnboundedReceiver, AbortOnDrop) { let (tx, rx) = mpsc::unbounded_channel(); - let base_url = self.base_url.clone(); - let api_key = self.api_key.clone(); - let model = self.model.clone(); + let client = self.clone(); let handle = tokio::spawn(async move { let result = run_session_generate( - session_lock, &base_url, &api_key, &model, - chunks, sampling, priority, readout_shape, &tx, + session_lock, &client, chunks, sampling, priority, + readout_shape, &tx, ).await; if let Err(e) = result { log::warn!(target: "grpc", @@ -195,9 +218,7 @@ impl ApiClient { /// handle is dropped so the next call reopens. async fn run_session_generate( session_lock: std::sync::Arc>>, - base_url: &str, - api_key: &str, - model: &str, + client: &ApiClient, chunks: Vec, sampling: SamplingParams, priority: Option, @@ -216,7 +237,7 @@ async fn run_session_generate( None => { drop(guard); log::debug!(target: "grpc", "run_session_generate: opening new session"); - salience::SessionHandle::open(base_url, api_key, model).await? + salience::SessionHandle::open(client).await? } } }; @@ -268,7 +289,7 @@ async fn run_session_generate( WireChunk::Tokens(t) => pending.extend_from_slice(t), WireChunk::Image { bytes, mime, .. } => { if !pending.is_empty() { - flush_pending(&mut handle, std::mem::take(&mut pending)).await?; + handle.prefill_only(std::mem::take(&mut pending)).await?; } let resp = handle .append_image(bytes.clone(), mime.clone(), false) @@ -394,39 +415,3 @@ async fn run_session_generate( Ok(()) } -/// Emit a prefill-only Generate for the pending token run. Used to -/// append text that separates two image blocks — the server needs -/// those tokens in its session before we AppendImage the next image, -/// but we don't want the cost or output of a decode step. 
-async fn flush_pending( - handle: &mut salience::SessionHandle, - tokens: Vec, -) -> Result<()> { - use futures::StreamExt; - use salience::pb; - let req = pb::GenerateRequest { - session_id: handle.session_id.clone(), - append_tokens: tokens, - offset: handle.committed_len, - truncating: false, - max_tokens: 0, - logprobs_ranges: Vec::new(), - logprob_top_k: 0, - readout_ranges: Vec::new(), - temperature: 0.0, - top_p: 0.0, - top_k: 0, - stop_token_ids: Vec::new(), - priority: 0, - }; - let mut stream = handle.generate(req).await?; - while let Some(event) = stream.next().await { - let event = event.map_err(|s| anyhow::anyhow!("flush Generate stream: {}", s))?; - if let Some(pb::generate_event::Event::Done(d)) = event.event { - handle.committed_len = d.total_tokens; - } - } - Ok(()) -} - - diff --git a/src/agent/api/salience.rs b/src/agent/api/salience.rs index 18f0d7b..bba950f 100644 --- a/src/agent/api/salience.rs +++ b/src/agent/api/salience.rs @@ -24,7 +24,12 @@ pub type SalienceClient = pb::salience_client::SalienceClient; /// looks like `https://host:8443`. User-provided CA certs under /// `~/.consciousness/certs/` are trusted in addition to the system /// roots (for self-signed server certs). -pub async fn connect(base_url: &str) -> Result { +/// +/// Returns the raw `Channel` so callers (`ApiClient::salience_client`) +/// can cache it and clone a `SalienceClient` per request without +/// reopening the TCP/TLS connection. tonic multiplexes RPCs over the +/// shared channel automatically. +pub async fn connect_channel(base_url: &str) -> Result { let mut endpoint = Endpoint::from_shared(base_url.to_string()) .with_context(|| format!("invalid salience endpoint: {}", base_url))? 
.connect_timeout(std::time::Duration::from_secs(30)) @@ -41,11 +46,10 @@ pub async fn connect(base_url: &str) -> Result { .with_context(|| "configuring tonic TLS")?; } - let channel = endpoint + endpoint .connect() .await - .with_context(|| format!("failed to connect to salience server at {}", base_url))?; - Ok(pb::salience_client::SalienceClient::new(channel)) + .with_context(|| format!("failed to connect to salience server at {}", base_url)) } /// Derive the gRPC base URL from the HTTP completions base URL. @@ -76,107 +80,42 @@ pub fn with_auth(req: &mut tonic::Request, api_key: &str) { } } -/// Call the server's `OpenSession` RPC and return the response. -pub async fn open_session( - base_url: &str, - api_key: &str, - model: &str, -) -> Result { - let mut client = connect(base_url).await?; - let mut req = tonic::Request::new(pb::OpenSessionRequest { - model: model.to_string(), - }); - with_auth(&mut req, api_key); - let resp = client - .open_session(req) - .await - .with_context(|| "OpenSession RPC failed")?; - Ok(resp.into_inner()) -} - -/// Call the server's `CloseSession` RPC. Idempotent on the server. -pub async fn close_session(base_url: &str, api_key: &str, session_id: &str) -> Result<()> { - let mut client = connect(base_url).await?; - let mut req = tonic::Request::new(pb::CloseSessionRequest { - session_id: session_id.to_string(), - }); - with_auth(&mut req, api_key); - client - .close_session(req) - .await - .with_context(|| "CloseSession RPC failed")?; - Ok(()) -} - -/// Append an image to a session. Server decodes the image, computes N -/// via vLLM's own multimodal pipeline, writes the full vision block -/// (`<|vision_start|> + IMAGE_PAD×N + <|vision_end|>`) into -/// session.tokens, and returns (N, new total length). 
-/// -/// `offset` is the client's view of the session's current token count; -/// the server rejects if it diverges from its own (unless -/// `truncating=true`, in which case the server slices to `offset` -/// first — but never through a vision block). -pub async fn append_image( - base_url: &str, - api_key: &str, - session_id: &str, - data: Vec, - mime: String, - offset: u32, - truncating: bool, -) -> Result { - let mut client = connect(base_url).await?; - let mut req = tonic::Request::new(pb::AppendImageRequest { - session_id: session_id.to_string(), - data, - mime, - offset, - truncating, - }); - with_auth(&mut req, api_key); - let resp = client - .append_image(req) - .await - .with_context(|| "AppendImage RPC failed")?; - Ok(resp.into_inner()) -} - -/// Handle to a server-side session. Carries the id + connection params -/// so subsequent per-session RPCs (AppendImage, Generate, ForkSession) -/// can be issued without the caller juggling base_url / api_key each -/// time. `committed_len` tracks the server's current session.tokens -/// length so the client can submit deltas with the right `offset`. +/// Handle to a server-side session. Carries the id + an `ApiClient` +/// clone (which holds the shared tonic Channel) so subsequent +/// per-session RPCs go over the process-global connection. +/// `committed_len` tracks the server's current session.tokens length +/// so the client can submit deltas with the right `offset`. 
pub struct SessionHandle { pub session_id: String, pub max_model_len: u32, - pub base_url: String, - pub api_key: String, pub committed_len: u32, + client: super::ApiClient, } impl SessionHandle { - pub async fn open(base_url: &str, api_key: &str, model: &str) -> Result { - let grpc_url = derive_grpc_url(base_url); - log::debug!(target: "grpc", - "SessionHandle::open http_base={} -> grpc_url={}", - base_url, grpc_url); - let resp = open_session(&grpc_url, api_key, model).await?; + pub async fn open(client: &super::ApiClient) -> Result { + let mut c = client.salience_client().await?; + let mut req = tonic::Request::new(pb::OpenSessionRequest { + model: client.model.clone(), + }); + with_auth(&mut req, client.api_key()); + let resp = c + .open_session(req) + .await + .with_context(|| "OpenSession RPC failed")? + .into_inner(); log::debug!(target: "grpc", "SessionHandle::open session_id={} max_model_len={}", resp.session_id, resp.max_model_len); Ok(Self { session_id: resp.session_id, max_model_len: resp.max_model_len, - base_url: grpc_url, - api_key: api_key.to_string(), committed_len: 0, + client: client.clone(), }) } - pub async fn close(self) -> Result<()> { - close_session(&self.base_url, &self.api_key, &self.session_id).await - } + pub fn client(&self) -> &super::ApiClient { &self.client } /// Append an image via the server-side vision block. Updates /// `committed_len` from the server's response on success. @@ -186,37 +125,105 @@ impl SessionHandle { mime: String, truncating: bool, ) -> Result { - let resp = append_image( - &self.base_url, - &self.api_key, - &self.session_id, + let mut c = self.client.salience_client().await?; + let mut req = tonic::Request::new(pb::AppendImageRequest { + session_id: self.session_id.clone(), data, mime, - self.committed_len, + offset: self.committed_len, truncating, - ) - .await?; + }); + with_auth(&mut req, self.client.api_key()); + let resp = c + .append_image(req) + .await + .with_context(|| "AppendImage RPC failed")? 
+ .into_inner(); self.committed_len = resp.total_length; Ok(resp) } /// Open a gRPC Generate stream with the given request. Caller /// iterates the returned stream of GenerateEvents; the handle's - /// `committed_len` is advanced on Done based on the Done event's - /// `total_tokens` field. + /// `committed_len` should be advanced by the caller on Done based + /// on the Done event's `total_tokens` field. pub async fn generate( &self, req: pb::GenerateRequest, ) -> Result> { - let mut client = connect(&self.base_url).await?; + let mut c = self.client.salience_client().await?; let mut req = tonic::Request::new(req); - with_auth(&mut req, &self.api_key); - let resp = client + with_auth(&mut req, self.client.api_key()); + let resp = c .generate(req) .await .with_context(|| "Generate RPC failed")?; Ok(resp.into_inner()) } + + /// Run a prefill-only Generate (max_tokens=0) that appends the + /// given tokens to the session. No decode, no Token events — the + /// server just extends session.tokens and runs prefill to warm + /// the KV cache. Used to interleave text runs between AppendImage + /// calls, and by score paths that want prompt_logprobs without a + /// decode step. 
+ pub async fn prefill_only(&mut self, tokens: Vec) -> Result<()> { + use futures::StreamExt; + let req = pb::GenerateRequest { + session_id: self.session_id.clone(), + append_tokens: tokens, + offset: self.committed_len, + truncating: false, + max_tokens: 0, + logprobs_ranges: Vec::new(), + logprob_top_k: 0, + readout_ranges: Vec::new(), + temperature: 0.0, + top_p: 0.0, + top_k: 0, + stop_token_ids: Vec::new(), + priority: 0, + }; + let mut stream = self.generate(req).await?; + while let Some(event) = stream.next().await { + let event = event.map_err(|s| anyhow::anyhow!("prefill Generate stream: {}", s))?; + if let Some(pb::generate_event::Event::Done(d)) = event.event { + self.committed_len = d.total_tokens; + } + } + Ok(()) + } +} + +/// Drop → fire CloseSession in a detached task so servers don't leak +/// sessions until TTL eviction. Best-effort: if no tokio runtime is +/// available we skip; the server's 30min TTL will reap it eventually. +impl Drop for SessionHandle { + fn drop(&mut self) { + if self.session_id.is_empty() { + return; + } + let session_id = std::mem::take(&mut self.session_id); + let client = self.client.clone(); + let Ok(rt) = tokio::runtime::Handle::try_current() else { + log::debug!(target: "grpc", + "SessionHandle drop outside tokio runtime, session {} leaks to TTL", + session_id); + return; + }; + rt.spawn(async move { + let Ok(mut c) = client.salience_client().await else { return }; + let mut req = tonic::Request::new(pb::CloseSessionRequest { + session_id: session_id.clone(), + }); + with_auth(&mut req, client.api_key()); + if let Err(e) = c.close_session(req).await { + log::debug!(target: "grpc", + "CloseSession on drop failed for {}: {:#}", + session_id, e); + } + }); + } } #[cfg(test)] diff --git a/src/subconscious/learn.rs b/src/subconscious/learn.rs index 3021fc3..dca9b3c 100644 --- a/src/subconscious/learn.rs +++ b/src/subconscious/learn.rs @@ -1,100 +1,166 @@ -// training.rs — Memory importance scoring via /v1/score +// 
learn.rs — Memory importance scoring over the salience gRPC protocol. // -// Three scoring modes, all built on the same call_score() primitive: +// Three scoring modes, all built on call_score(): // // score_memories() — Full N×M matrix (memories × responses) for the -// debug screen. Expensive: N+1 API calls. +// debug screen. Expensive: N+1 sessions/calls. // -// memory_score() — Single memory importance. Scores the 50 messages +// score_memory() — Single memory importance. Scores the 50 messages // after it was surfaced, with/without that memory. -// 2 API calls. +// 2 calls. // // finetune_score() — Identifies training candidates. Scores recent // messages with all memories stripped. Responses // with high divergence depend on memories the model -// hasn't internalized. 2 API calls. +// hasn't internalized. 2 calls. +// +// Each call opens an ephemeral gRPC session (reusing the shared +// tonic Channel on `ApiClient`), pushes the prompt through as +// interleaved tokens + AppendImage calls, runs Generate with +// max_tokens=0 + logprobs_ranges over the scored positions, collects +// each Token event's sampled_logprob, then drops the SessionHandle — +// which triggers a best-effort CloseSession over the shared channel. 
use std::sync::Arc; use crate::agent::api::ApiClient; +use crate::agent::api::salience::{SessionHandle, pb}; use crate::agent::context::{ - Ast, AstNode, ContextState, Role, WireImage, + Ast, AstNode, ContextState, Role, WireChunk, WireImage, is_assistant, is_memory_node, memory_key, render_branch_text, render_prior_context, }; +use crate::agent::tokenizer; use crate::mind::{MindState, MindTriggered, TaskHandle}; use crate::subconscious::generate::gen_continuation; -const SCORE_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(300); - // ── Score API ─────────────────────────────────────────────────── -#[derive(serde::Deserialize)] +#[derive(Debug, Clone)] struct ScoreResult { total_logprob: f64, } -#[derive(serde::Deserialize)] -struct ScoreResponse { - scores: Vec, -} - -fn http_client() -> crate::agent::api::http::HttpClient { - crate::agent::api::http::HttpClient::builder() - .timeout(SCORE_TIMEOUT) - .build() +/// Convert a flat (prompt_tokens, images) pair into the interleaved +/// chunks the session protocol expects. Tokens up to the next +/// `<|vision_start|>` become a Tokens chunk; each +/// `<|vision_start|>..<|vision_end|>` run collapses into one Image +/// chunk paired by position with the next entry in `images`. The +/// server re-expands the IMAGE_PADs on AppendImage. 
+fn prompt_to_chunks(prompt: &[u32], images: &[WireImage]) -> Vec { + let mut out: Vec = Vec::new(); + let mut cur = 0; + let mut img_idx = 0; + while cur < prompt.len() { + if prompt[cur] == tokenizer::VISION_START { + let end_rel = prompt[cur..].iter() + .position(|&t| t == tokenizer::VISION_END) + .unwrap_or_else(|| panic!( + "unmatched VISION_START at position {} in prompt", cur)); + let end = cur + end_rel + 1; + let img = images.get(img_idx) + .unwrap_or_else(|| panic!( + "image index {} out of range for {} images", img_idx, images.len())); + out.push(WireChunk::Image { + bytes: img.bytes.clone(), + mime: img.mime.clone(), + known_expanded_len: (end - cur) as u32, + }); + img_idx += 1; + cur = end; + } else { + let next_vs = prompt[cur..].iter() + .position(|&t| t == tokenizer::VISION_START); + let end = match next_vs { + Some(o) => cur + o, + None => prompt.len(), + }; + out.push(WireChunk::Tokens(prompt[cur..end].to_vec())); + cur = end; + } + } + out } async fn call_score( - http: &crate::agent::api::http::HttpClient, client: &ApiClient, prompt: &[u32], images: &[WireImage], ranges: &[(usize, usize)], priority: Option, ) -> anyhow::Result> { + use futures::StreamExt; + // Nothing to score — skip the round-trip. 
if ranges.is_empty() { return Ok(Vec::new()); } - let url = format!("{}/score", client.base_url()); - let auth = format!("Bearer {}", client.api_key()); - let mut body = serde_json::json!({ - "model": client.model, - "prompt": prompt, - "score_ranges": ranges, - "logprobs": 1, - }); - if !images.is_empty() { - use base64::Engine; - let b64 = base64::engine::general_purpose::STANDARD; - let uris: Vec = images.iter() - .map(|img| format!("data:{};base64,{}", img.mime, b64.encode(&img.bytes))) - .collect(); - body["multi_modal_data"] = serde_json::json!({ "image": uris }); - } - if let Some(p) = priority { - body["priority"] = serde_json::json!(p); - } - let response = http - .send_json("POST", &url, &[ - ("authorization", &auth), - ], &body) - .await?; - let status = response.status(); - let body: serde_json::Value = response.json().await?; + let chunks = prompt_to_chunks(prompt, images); + let mut handle = SessionHandle::open(client).await?; - if !status.is_success() { - let msg = body.get("error").and_then(|e| e.as_str()).unwrap_or("unknown error"); - anyhow::bail!("score API HTTP {}: {}", status, msg); - } - if let Some(err) = body.get("error").and_then(|e| e.as_str()) { - anyhow::bail!("score API error: {}", err); + // Walk chunks: AppendImage for each image, prefill-only Generate + // for each text run between images. Accumulate any trailing text + // run into `pending` for the final logprob-generating Generate. + let mut pending: Vec = Vec::new(); + for chunk in chunks { + match chunk { + WireChunk::Tokens(t) => pending.extend(t), + WireChunk::Image { bytes, mime, .. 
} => { + if !pending.is_empty() { + handle.prefill_only(std::mem::take(&mut pending)).await?; + } + handle.append_image(bytes, mime, false).await?; + } + } } - let result: ScoreResponse = serde_json::from_value(body) - .map_err(|e| anyhow::anyhow!("failed to parse score response: {}", e))?; - Ok(result.scores) + // Final Generate: max_tokens=0 so the server runs prefill of the + // trailing `pending` tokens and emits Token events for each + // position covered by logprobs_ranges, then Done. logprob_top_k=0 + // means "just the sampled (prompt) token's logprob" — no top-k + // alternatives, which is all call_score historically needed. + let logprobs_ranges: Vec = ranges.iter() + .map(|(s, e)| pb::PositionRange { start: *s as u32, end: *e as u32 }) + .collect(); + let req = pb::GenerateRequest { + session_id: handle.session_id.clone(), + append_tokens: pending, + offset: handle.committed_len, + truncating: false, + max_tokens: 0, + logprobs_ranges, + logprob_top_k: 0, + readout_ranges: Vec::new(), + temperature: 0.0, + top_p: 0.0, + top_k: 0, + stop_token_ids: Vec::new(), + priority: priority.unwrap_or(0), + }; + + let mut stream = handle.generate(req).await?; + let mut totals = vec![0.0f64; ranges.len()]; + while let Some(event) = stream.next().await { + let event = event + .map_err(|s| anyhow::anyhow!("score Generate stream: {}", s))?; + let Some(inner) = event.event else { continue }; + match inner { + pb::generate_event::Event::Token(t) => { + if !t.has_sampled_logprob { continue; } + let pos = t.position as usize; + for (i, (start, end)) in ranges.iter().enumerate() { + if pos >= *start && pos < *end { + totals[i] += t.sampled_logprob as f64; + } + } + } + pb::generate_event::Event::Done(_) => break, + } + } + + Ok(totals.into_iter() + .map(|total_logprob| ScoreResult { total_logprob }) + .collect()) } /// Compute per-position logprob divergence: how much worse the model @@ -110,7 +176,6 @@ fn divergence(baseline: &[ScoreResult], without: &[ScoreResult]) -> Vec 
{ /// Score two message sets and return total divergence. async fn score_divergence( - http: &crate::agent::api::http::HttpClient, client: &ApiClient, context: &ContextState, range: std::ops::Range, @@ -123,9 +188,9 @@ where F: FnMut(&AstNode) -> bool, context.wire_prompt(range.clone(), |_| false); let (without_tokens, without_images, without_ranges) = context.wire_prompt(range, skip); - let baseline = call_score(http, client, &baseline_tokens, &baseline_images, + let baseline = call_score(client, &baseline_tokens, &baseline_images, &baseline_ranges, priority).await?; - let without = call_score(http, client, &without_tokens, &without_images, + let without = call_score(client, &without_tokens, &without_images, &without_ranges, priority).await?; let divs = divergence(&baseline, &without); Ok((divs, baseline)) @@ -162,14 +227,13 @@ pub async fn score_memories( dbglog!("[scoring-full] starting: {} memories × {} responses", total, response_indices.len()); - let http = http_client(); let activity = crate::agent::start_activity(agent, "scoring: baseline").await; let (baseline_tokens, baseline_images, baseline_ranges) = { let ctx = agent.context.lock().await; ctx.wire_prompt(0..ctx.conversation().len(), |_| false) }; - let baseline = call_score(&http, client, &baseline_tokens, &baseline_images, + let baseline = call_score(client, &baseline_tokens, &baseline_images, &baseline_ranges, Some(5)).await?; dbglog!("[scoring-full] baseline done ({} response scores)", baseline.len()); @@ -180,7 +244,7 @@ pub async fn score_memories( let ctx = agent.context.lock().await; ctx.wire_prompt(0..ctx.conversation().len(), |n| memory_key(n) == Some(key.as_str())) }; - let row = match call_score(&http, client, &tokens, &images, &ranges, Some(5)).await { + let row = match call_score(client, &tokens, &images, &ranges, Some(5)).await { Ok(without) => { let divs = divergence(&baseline, &without); let max_div = divs.iter().cloned().fold(0.0f64, f64::max); @@ -263,8 +327,7 @@ pub async fn 
score_memory( return Ok(0.0); } - let http = http_client(); - let (divs, _) = score_divergence(&http, client, context, range, + let (divs, _) = score_divergence(client, context, range, |n| memory_key(n) == Some(key), Some(5)).await?; Ok(divs.iter().sum()) @@ -322,7 +385,6 @@ where // Score oldest-first candidates.sort_by_key(|&(_, _, last)| last); - let http = http_client(); let mut scored = 0; let entries = context.conversation(); @@ -357,7 +419,7 @@ where } activity.update(format!("scoring: {}/{} {}", scored + 1, total, key)).await; - match score_divergence(&http, client, context, range, + match score_divergence(client, context, range, |n| memory_key(n) == Some(key), Some(5)).await { Ok((divs, _)) => { let n_responses = divs.len(); @@ -505,8 +567,7 @@ pub async fn score_finetune( return Ok(Vec::new()); } - let http = http_client(); - let (divs, _) = score_divergence(&http, client, context, range, is_memory_node, Some(5)).await?; + let (divs, _) = score_divergence(client, context, range, is_memory_node, Some(5)).await?; let mut results: Vec<(usize, f64)> = response_positions.iter() .enumerate() @@ -804,8 +865,10 @@ pub async fn send_to_train( } }); - let http = http_client(); let url = format!("{}/train", client.base_url()); + let http = crate::agent::api::http::HttpClient::builder() + .timeout(std::time::Duration::from_secs(300)) + .build(); let response = http.send_json("POST", &url, &[], &body).await?; let status = response.status(); From fe232cf292d05dcad1524cc3455fba5985155bce Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Apr 2026 20:26:47 -0400 Subject: [PATCH 05/31] salience: client-side pad expansion, drop AppendImage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors the vLLM-side rewrite. AppendImage is gone; images now ride along on Generate via a parallel `images` list. - Productionize `qwen3_image_token_count` (was test-only). 
Image leaf computes its IMAGE_PAD count eagerly at construction from height/width; `token_count` is no longer "0 until the server tells us." - WireChunk shrinks to a single `Tokens(Vec)` variant — vision blocks live inline in the token stream. - `wire_chunks` now returns `(Vec, Vec)`. `WireImage` carries `pad_start` / `pad_end` (absolute positions in the full walk) alongside bytes + mime. - `assemble_prompt` returns `(chunks, images, match_upto)`. - `stream_session_mm` / `run_session_generate` take the parallel images list, filter to those past `match_upto`, and pass them in `GenerateRequest.images` as `pb::ImageAttachment` entries. - Drop `SessionHandle::append_image`, `ContextState::commit_image_token_counts`, `StreamToken::ImageAppended`, the WireChunk::Image branch in `learn.rs`, and the now-empty `prompt_to_chunks` helper. - Add 'v' toggle on the conscious-screen tree to render token-id vectors in place of text content (debug-aid: lets us see what the server actually has when output is suspicious). - Comment out the subconscious-trigger spawn loop — Kent had this disabled before; it had crept back into running. Co-Authored-By: Proof of Concept --- proto/salience.proto | 114 ++++++----- src/agent/api/mod.rs | 122 ++++++------ src/agent/api/salience.rs | 39 ++-- src/agent/context.rs | 363 +++++++++++++++++++++++------------ src/agent/mod.rs | 13 +- src/agent/tools/vision.rs | 2 +- src/mind/mod.rs | 4 +- src/subconscious/generate.rs | 10 +- src/subconscious/learn.rs | 62 +++--- src/user/context.rs | 5 + src/user/subconscious.rs | 2 + src/user/widgets.rs | 48 ++++- 12 files changed, 473 insertions(+), 311 deletions(-) diff --git a/proto/salience.proto b/proto/salience.proto index 01c0f1e..fab4e6d 100644 --- a/proto/salience.proto +++ b/proto/salience.proto @@ -58,21 +58,26 @@ service Salience { // boundary). rpc ForkSession(ForkSessionRequest) returns (ForkSessionResponse); - // Append an image to the session. 
Server decodes, runs vLLM's - // multimodal pipeline to compute N (IMAGE_PAD count), and writes - // the whole vision block into session.tokens. Returns N and the - // new total length. - rpc AppendImage(AppendImageRequest) returns (AppendImageResponse); - - // Prefill + optionally decode. See GenerateRequest for full - // semantics; stream yields Token events (with optional readouts / - // logprobs per position) followed by a terminating Done. + // Prefill + optionally decode. Images are attached inline via + // `GenerateRequest.images`; the client writes its own pre-expanded + // <|vision_start|> + N*<|image_pad|> + <|vision_end|> runs into + // `append_tokens` and declares each run's range in `images[i]`. + // Server validates run length against the actual vision-encoder + // feature count and returns INVALID_ARGUMENT on mismatch. Stream + // yields Token events (with optional readouts / logprobs per + // position) followed by a terminating Done. rpc Generate(GenerateRequest) returns (stream GenerateEvent); // Readout manifest for the currently-loaded model — concept names, // layer indices, tensor dtype. Stateless; fetch once at client // startup and cache. rpc GetReadoutManifest(GetReadoutManifestRequest) returns (ReadoutManifest); + + // Dump the full token stream of a session. Debug-only: used by the + // client to verify its local accounting against the server's + // session.tokens byte-for-byte when divergence is suspected. Not + // cheap — copies the whole sequence across the wire. + rpc DumpSession(DumpSessionRequest) returns (DumpSessionResponse); } // ============================================================ @@ -106,55 +111,47 @@ message ForkSessionResponse { string session_id = 1; // new session } -// ============================================================ -// Mutation -// ============================================================ - -message AppendImageRequest { - string session_id = 1; - - // Image bytes (PNG / JPEG / WebP / …). 
- bytes data = 2; - - // MIME type, e.g. "image/png". - string mime = 3; - - // Client's view of the session's current token length. Must equal - // the server's actual length, OR be strictly less when - // truncating=true. Any mismatch is FAILED_PRECONDITION. - uint32 offset = 4; - - // If true, server truncates session.tokens to `offset` before - // appending. Rejected with FAILED_PRECONDITION if the truncation - // would split an image block. - bool truncating = 5; -} - -message AppendImageResponse { - // Count of <|image_pad|> tokens inside the vision block. Does not - // include the <|vision_start|> / <|vision_end|> bookends, which - // contribute one token each. - uint32 placeholder_count = 1; - - // Session's total token length after this append, including both - // bookends (= offset + placeholder_count + 2, barring truncation). - uint32 total_length = 2; -} - // ============================================================ // Inference // ============================================================ +// One image attached to a Generate call. The client is responsible +// for writing the expanded placeholder run (VISION_START + +// N*IMAGE_PAD + VISION_END) into `GenerateRequest.append_tokens` at +// positions [pad_range_start, pad_range_end) and pairing it with +// the corresponding `ImageAttachment` entry. Server validates that +// the declared range's pad count matches what the vision encoder +// produces, and returns INVALID_ARGUMENT if they disagree. +message ImageAttachment { + // Image bytes (PNG / JPEG / WebP / …). + bytes bytes = 1; + + // MIME type, e.g. "image/png". + string mime = 2; + + // Absolute token positions (in `session.tokens` AFTER `append_tokens` + // is applied) spanning the full vision block — `[vision_start, + // pad*N, vision_end]`. end is exclusive, so end - start == N + 2. + uint32 pad_range_start = 3; + uint32 pad_range_end = 4; +} + message GenerateRequest { string session_id = 1; - // Tokens to append before prefill. 
May be empty. Client must NOT - // include vision tokens (<|vision_start|>, <|image_pad|>, - // <|vision_end|>) — those live in the session via AppendImage. + // Tokens to append before prefill. May be empty. Client writes the + // full vision block (VISION_START + N*IMAGE_PAD + VISION_END) for + // any newly-attached image directly into this stream; each such + // block must be paired with a matching entry in `images`. The + // server validates that the declared ranges all point at IMAGE_PAD + // runs and that each run's length matches what the vision encoder + // produces for the corresponding image. repeated uint32 append_tokens = 2; - // Offset / truncating — same semantics as AppendImage. Truncation - // that splits an image block is FAILED_PRECONDITION. + // Client's view of session.tokens length at the time of the call. + // Must equal server's actual length, OR be strictly less when + // truncating=true (server rewinds before appending). Any other + // mismatch is FAILED_PRECONDITION. uint32 offset = 3; bool truncating = 4; @@ -185,6 +182,12 @@ message GenerateRequest { // vLLM scheduler priority (0 = interactive, 10 = batch). int32 priority = 13; + + // Images newly attached on this call. Each entry describes one + // image's binary bytes, its mime type, and the exact token-position + // range of its pre-expanded placeholder run inside `session.tokens` + // after `append_tokens` is applied. See `ImageAttachment`. + repeated ImageAttachment images = 14; } message PositionRange { @@ -258,3 +261,16 @@ message ReadoutManifest { uint32 hidden_size = 3; string dtype = 4; } + +// ============================================================ +// Debug +// ============================================================ + +message DumpSessionRequest { + string session_id = 1; +} + +message DumpSessionResponse { + // The full session.tokens sequence, verbatim. 
+ repeated uint32 tokens = 1 [packed = true]; +} diff --git a/src/agent/api/mod.rs b/src/agent/api/mod.rs index 1352d5f..5705d89 100644 --- a/src/agent/api/mod.rs +++ b/src/agent/api/mod.rs @@ -73,12 +73,6 @@ pub enum StreamToken { /// `readout` is `None` when the server has readout disabled or /// returned no readout for this chunk. Token { id: u32, readout: Option }, - /// An image was committed server-side via AppendImage during this - /// stream. `placeholder_count` is the N IMAGE_PADs the server - /// wrote. Emitted in AST order — caller applies these counts to - /// the first-N image leaves that currently have token_count=0 - /// via `ContextState::commit_image_token_counts`. - ImageAppended { placeholder_count: u32 }, Done { usage: Option }, Error(String), } @@ -150,6 +144,8 @@ impl ApiClient { &self, session_lock: std::sync::Arc>>, chunks: Vec, + images: Vec, + match_upto: u32, sampling: SamplingParams, priority: Option, readout_shape: Option<(u32, u32)>, @@ -159,8 +155,8 @@ impl ApiClient { let handle = tokio::spawn(async move { let result = run_session_generate( - session_lock, &client, chunks, sampling, priority, - readout_shape, &tx, + session_lock, &client, chunks, images, match_upto, sampling, + priority, readout_shape, &tx, ).await; if let Err(e) = result { log::warn!(target: "grpc", @@ -220,6 +216,8 @@ async fn run_session_generate( session_lock: std::sync::Arc>>, client: &ApiClient, chunks: Vec, + images: Vec, + match_upto: u32, sampling: SamplingParams, priority: Option, readout_shape: Option<(u32, u32)>, @@ -242,68 +240,69 @@ async fn run_session_generate( } }; - // Skip chunks already on the server. committed_len must land on - // a chunk boundary — every successful AppendImage / Generate - // advances committed_len by exactly one chunk's contribution, - // so straddling means divergence (client's AST was rewritten - // under us). 
- let mut acc: u32 = 0; - let mut delta_start = chunks.len(); - for (i, chunk) in chunks.iter().enumerate() { - if acc == handle.committed_len { - delta_start = i; - break; - } - let len = match chunk { - WireChunk::Tokens(t) => t.len() as u32, - WireChunk::Image { known_expanded_len, .. } => *known_expanded_len, - }; - if len == 0 { - anyhow::bail!( - "session divergence: chunk {} has unknown length but \ - precedes committed_len {} (acc={})", - i, handle.committed_len, acc, - ); - } - if acc + len > handle.committed_len { - anyhow::bail!( - "session divergence: chunk {} straddles committed_len \ - (acc={}, len={}, committed={})", - i, acc, len, handle.committed_len, - ); - } - acc += len; - } - if acc != handle.committed_len { - anyhow::bail!( - "session divergence: chunks sum to {} but committed_len is {}", - acc, handle.committed_len, - ); + // If the client believes the match extends only up to `match_upto` + // but the server has more, we need to rewind. For v1 the match is + // either whole or broken — `match_upto` is always 0 on any mutation + // — so the cheapest correct recovery is to drop the session and + // open a fresh one. + if match_upto < handle.committed_len { + log::warn!(target: "grpc", + "session rewind: match_upto={} < committed_len={} — reopening session (resending {} bytes)", + match_upto, handle.committed_len, handle.committed_len - match_upto); + drop(handle); + handle = salience::SessionHandle::open(client).await?; } - // Walk the delta: accumulate Tokens in `pending`; on Image, - // flush pending via prefill-only Generate then AppendImage. + // Walk chunks at byte-level, taking everything past `match_upto` + // as the delta. Token chunks can be split mid-way; images live + // inline in the token stream, so there's no separate image-chunk + // case anymore. + let mut acc: u32 = 0; let mut pending: Vec = Vec::new(); - for chunk in &chunks[delta_start..] 
{ + for chunk in chunks.iter() { match chunk { - WireChunk::Tokens(t) => pending.extend_from_slice(t), - WireChunk::Image { bytes, mime, .. } => { - if !pending.is_empty() { - handle.prefill_only(std::mem::take(&mut pending)).await?; + WireChunk::Tokens(t) => { + let len = t.len() as u32; + let chunk_end = acc + len; + if chunk_end <= match_upto { + acc = chunk_end; + } else if acc < match_upto { + let skip = (match_upto - acc) as usize; + pending.extend_from_slice(&t[skip..]); + acc = chunk_end; + } else { + pending.extend_from_slice(t); + acc = chunk_end; } - let resp = handle - .append_image(bytes.clone(), mime.clone(), false) - .await?; - log::debug!(target: "grpc", - "AppendImage: N={} total_length={}", - resp.placeholder_count, resp.total_length); - let _ = tx.send(StreamToken::ImageAppended { - placeholder_count: resp.placeholder_count, - }); } } } + // Filter images to those entirely past `match_upto` — anything + // before is on the server already (prior turn), anything + // straddling is a hard divergence (image partially-sent shouldn't + // happen with our atomic AppendImage history; with images-inline + // it can only happen if mark_dirty cleared match_upto mid-block, + // which the AST mutators prevent). + let mut new_images: Vec = Vec::new(); + for img in &images { + if img.pad_end <= match_upto { + continue; // already sent on a prior turn + } + if img.pad_start < match_upto { + anyhow::bail!( + "session divergence: image at [{},{}) straddles match_upto={}", + img.pad_start, img.pad_end, match_upto, + ); + } + new_images.push(pb::ImageAttachment { + bytes: img.bytes.clone(), + mime: img.mime.clone(), + pad_range_start: img.pad_start, + pad_range_end: img.pad_end, + }); + } + // Final Generate: pending holds any trailing text; decode up to // sampling.max_tokens. 
Request readouts on all decode positions // via a catch-all range ending at u32::MAX — decode never @@ -331,6 +330,7 @@ async fn run_session_generate( top_k: sampling.top_k, stop_token_ids: Vec::new(), priority: priority.unwrap_or(0), + images: new_images, }; let session_id_for_log = handle.session_id.clone(); let t_generate = Instant::now(); diff --git a/src/agent/api/salience.rs b/src/agent/api/salience.rs index bba950f..f5f65d2 100644 --- a/src/agent/api/salience.rs +++ b/src/agent/api/salience.rs @@ -94,6 +94,8 @@ pub struct SessionHandle { impl SessionHandle { pub async fn open(client: &super::ApiClient) -> Result { + let t0 = std::time::Instant::now(); + log::debug!(target: "grpc", "OpenSession rpc: start"); let mut c = client.salience_client().await?; let mut req = tonic::Request::new(pb::OpenSessionRequest { model: client.model.clone(), @@ -105,8 +107,8 @@ impl SessionHandle { .with_context(|| "OpenSession RPC failed")? .into_inner(); log::debug!(target: "grpc", - "SessionHandle::open session_id={} max_model_len={}", - resp.session_id, resp.max_model_len); + "OpenSession rpc: done session_id={} max_model_len={} elapsed={:?}", + resp.session_id, resp.max_model_len, t0.elapsed()); Ok(Self { session_id: resp.session_id, max_model_len: resp.max_model_len, @@ -117,30 +119,21 @@ impl SessionHandle { pub fn client(&self) -> &super::ApiClient { &self.client } - /// Append an image via the server-side vision block. Updates - /// `committed_len` from the server's response on success. - pub async fn append_image( - &mut self, - data: Vec, - mime: String, - truncating: bool, - ) -> Result { + /// Debug-only: fetch the server's full session.tokens. Used to + /// verify client-side accounting byte-for-byte when divergence + /// is suspected. Not cheap on large sessions. 
+ pub async fn dump_tokens(&self) -> Result> { let mut c = self.client.salience_client().await?; - let mut req = tonic::Request::new(pb::AppendImageRequest { + let mut req = tonic::Request::new(pb::DumpSessionRequest { session_id: self.session_id.clone(), - data, - mime, - offset: self.committed_len, - truncating, }); with_auth(&mut req, self.client.api_key()); let resp = c - .append_image(req) + .dump_session(req) .await - .with_context(|| "AppendImage RPC failed")? + .with_context(|| "DumpSession RPC failed")? .into_inner(); - self.committed_len = resp.total_length; - Ok(resp) + Ok(resp.tokens) } /// Open a gRPC Generate stream with the given request. Caller @@ -151,6 +144,10 @@ impl SessionHandle { &self, req: pb::GenerateRequest, ) -> Result> { + let t0 = std::time::Instant::now(); + log::debug!(target: "grpc", + "Generate rpc: open-stream session={} offset={} append={} max_tokens={}", + self.session_id, req.offset, req.append_tokens.len(), req.max_tokens); let mut c = self.client.salience_client().await?; let mut req = tonic::Request::new(req); with_auth(&mut req, self.client.api_key()); @@ -158,6 +155,9 @@ impl SessionHandle { .generate(req) .await .with_context(|| "Generate RPC failed")?; + log::debug!(target: "grpc", + "Generate rpc: stream opened session={} open-latency={:?}", + self.session_id, t0.elapsed()); Ok(resp.into_inner()) } @@ -183,6 +183,7 @@ impl SessionHandle { top_k: 0, stop_token_ids: Vec::new(), priority: 0, + images: Vec::new(), }; let mut stream = self.generate(req).await?; while let Some(event) = stream.next().await { diff --git a/src/agent/context.rs b/src/agent/context.rs index 2982851..0a49e05 100644 --- a/src/agent/context.rs +++ b/src/agent/context.rs @@ -143,6 +143,13 @@ pub enum AstNode { /// Maps memory key → divergence score for this response. #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")] memory_scores: std::collections::BTreeMap, + /// Cached token stream for the subtree. 
When `Some`, wire-out + /// uses these bytes verbatim and skips recursion into children. + /// Populated by the response parser from the server's exact + /// stream; also computable from children as a fallback. Cleared + /// on any edit to a descendant. Not serialized — transient. + #[serde(skip, default)] + token_ids: Option>, }, } @@ -155,6 +162,14 @@ pub struct ContextState { journal: Vec, conversation: Vec, pub conversation_log: Option, + /// Length of the session's token stream on the server, as of the + /// last Done event. Updated by the grpc layer. + server_committed_len: u32, + /// Prefix length of our walk that still matches the server's + /// session.tokens byte-for-byte. When < `server_committed_len` + /// the session needs rewinding (truncating=true at this offset). + /// Reset to 0 on any mutation that could have changed sent bytes. + client_match_upto: u32, } impl Clone for ContextState { @@ -165,6 +180,8 @@ impl Clone for ContextState { journal: self.journal.clone(), conversation: self.conversation.clone(), conversation_log: None, // forked contexts don't log + server_committed_len: self.server_committed_len, + client_match_upto: self.client_match_upto, } } } @@ -201,6 +218,10 @@ pub struct ResponseParser { think_buf: String, in_tool_call: bool, tool_call_buf: String, + /// Raw generated token IDs, in arrival order. Combined with the + /// prologue at `finish` to stamp the Branch's authoritative + /// token cache — the bytes the server has for this branch. + generated_tokens: Vec, } impl Role { @@ -369,8 +390,11 @@ impl AstNode { mime: impl Into, orig_height: u32, orig_width: u32, - token_count: u32, ) -> Self { + // Pad count is computed eagerly from dimensions — no more + // "unknown until server responds" shape. Server validates + // on the Generate call; mismatches fail loud. 
+ let token_count = qwen3_image_token_count(orig_height, orig_width); Self::Leaf(NodeLeaf::new(NodeBody::Image { bytes, mime: mime.into(), @@ -383,7 +407,13 @@ impl AstNode { // -- Branch constructors -------------------------------------------------- pub fn branch(role: Role, children: Vec) -> Self { - Self::Branch { role, children, timestamp: Utc::now(), memory_scores: Default::default() } + Self::Branch { + role, + children, + timestamp: Utc::now(), + memory_scores: Default::default(), + token_ids: None, + } } pub fn system_msg(text: impl Into) -> Self { @@ -392,6 +422,7 @@ impl AstNode { children: vec![Self::content(text)], timestamp: Utc::now(), memory_scores: Default::default(), + token_ids: None, } } @@ -401,6 +432,7 @@ impl AstNode { children: vec![Self::content(text)], timestamp: Utc::now(), memory_scores: Default::default(), + token_ids: None, } } @@ -412,11 +444,12 @@ impl AstNode { let token_ids = leaf.body.compute_token_ids(); Self::Leaf(NodeLeaf { token_ids, ..leaf }) } - Self::Branch { role, children, timestamp, memory_scores } => Self::Branch { + Self::Branch { role, children, timestamp, memory_scores, .. } => Self::Branch { role, children: children.into_iter().map(|c| c.retokenize()).collect(), timestamp, memory_scores, + token_ids: None, }, } } @@ -493,7 +526,10 @@ impl AstNode { fn token_ids_into(&self, out: &mut Vec) { match self { Self::Leaf(leaf) => out.extend_from_slice(&leaf.token_ids), - Self::Branch { role, children, .. } => { + Self::Branch { token_ids: Some(cached), .. } => { + out.extend_from_slice(cached); + } + Self::Branch { role, children, token_ids: None, .. } => { out.push(tokenizer::IM_START); out.extend(tokenizer::encode(&format!("{}\n", role.as_str()))); for child in children { @@ -522,7 +558,8 @@ impl Ast for AstNode { fn tokens(&self) -> usize { match self { Self::Leaf(leaf) => leaf.tokens(), - Self::Branch { role, children, .. } => { + Self::Branch { token_ids: Some(cached), .. 
} => cached.len(), + Self::Branch { role, children, token_ids: None, .. } => { 1 + role_header_tokens(*role) + children.iter().map(|c| c.tokens()).sum::() + 1 + newline_tokens() @@ -676,6 +713,7 @@ impl ResponseParser { think_buf: String::new(), in_tool_call: false, tool_call_buf: String::new(), + generated_tokens: Vec::new(), } } @@ -706,6 +744,7 @@ impl ResponseParser { buf.push(id, r); } } + parser.generated_tokens.push(id); let text = super::tokenizer::decode(&[id]); full_text.push_str(&text); let mut ctx = agent.context.lock().await; @@ -740,22 +779,16 @@ impl ResponseParser { let _ = writeln!(f, " unparsed text: {}", &full_text[..end]); } } - if let Some(u) = usage { + if let Some(ref u) = usage { agent.state.lock().await.last_prompt_tokens = u.prompt_tokens; } let mut ctx = agent.context.lock().await; parser.finish(&mut ctx); + if let Some(u) = usage { + ctx.note_session_synced(u.total_tokens); + } return Ok(()); } - super::api::StreamToken::ImageAppended { placeholder_count } => { - // Commit the server-authoritative IMAGE_PAD - // count into the first zero-count image leaf - // in wire order. AppendImage always runs - // before the final Generate, so this fires - // before any Token events for this stream. - let mut ctx = agent.context.lock().await; - ctx.commit_image_token_counts(&[placeholder_count]); - } super::api::StreamToken::Error(e) => { return Err(anyhow::anyhow!("{}", e)); } @@ -842,7 +875,7 @@ impl ResponseParser { } fn push_child(&self, ctx: &mut ContextState, child: AstNode) { - ctx.push_child(Section::Conversation, self.branch_idx, child); + ctx.push_child_raw(Section::Conversation, self.branch_idx, child); } fn flush_content(&mut self, ctx: &mut ContextState) { @@ -860,6 +893,29 @@ impl ResponseParser { self.content_parts.push(std::mem::take(&mut self.buf)); } self.flush_content(ctx); + + // Stamp the authoritative token cache onto the branch. 
+ // Layout mirrors the full chat-template rendering of a + // message block: + // + // IM_START + "assistant\n" [+ "\n"] (prologue — what we sent) + // + generated_tokens (what the server generated, ends in IM_END) + // + "\n" (trailing newline — template-required) + // + // Server only has through the IM_END (model stops on it, + // doesn't emit "\n"). Match-upto lands inside the cache + // right after IM_END; the chunk-walk's straddle path picks + // up the trailing "\n" as the head of the next turn's delta. + // The "\n" between turns matters: without it Qwen sees + // `<|im_end|><|im_start|>` back-to-back (no newline) and + // responds with garbage. + let prologue_text = if self.in_think { "assistant\n\n" } else { "assistant\n" }; + let mut cache = Vec::with_capacity(1 + self.generated_tokens.len() + 8); + cache.push(tokenizer::IM_START); + cache.extend(tokenizer::encode(prologue_text)); + cache.extend(self.generated_tokens); + cache.extend(tokenizer::encode("\n")); + ctx.set_branch_cache(Section::Conversation, self.branch_idx, cache); } } @@ -871,9 +927,39 @@ impl ContextState { journal: Vec::new(), conversation: Vec::new(), conversation_log: None, + server_committed_len: 0, + client_match_upto: 0, } } + // -- Server sync tracking ------------------------------------------------- + + /// Length of the session's token stream on the server. Updated by + /// the grpc layer from Generate Done events. + pub fn server_committed_len(&self) -> u32 { self.server_committed_len } + + /// Prefix of our walk we still believe matches the server + /// byte-for-byte. If less than `server_committed_len`, the next + /// Generate must send `truncating=true` at this offset. + pub fn client_match_upto(&self) -> u32 { self.client_match_upto } + + /// Called by the grpc layer after a successful Generate Done: + /// records both the server's new length and the fact that we + /// match up to it (we just sent everything). 
+ pub fn note_session_synced(&mut self, total_tokens: u32) { + self.server_committed_len = total_tokens; + self.client_match_upto = total_tokens; + } + + /// Reset match-upto to 0. Called from every mutation that could + /// have touched a region the server already has. For now, + /// conservatively drops alignment entirely — finer-grained + /// tracking (match-upto at the mutated node's offset) is a + /// future optimization. + fn mark_dirty(&mut self) { + self.client_match_upto = 0; + } + // -- Read access ---------------------------------------------------------- pub fn system(&self) -> &[AstNode] { &self.system } @@ -886,35 +972,6 @@ impl ContextState { [&self.system, &self.identity, &self.journal, &self.conversation] } - /// Walk image leaves across all sections in wire order and fill in - /// the first N leaves that have `token_count == 0` with successive - /// values from `counts`. Used after a gRPC session's stream of - /// AppendImage responses to commit the server's IMAGE_PAD counts - /// back into the AST so the next wire walk doesn't see zero-count - /// images in the already-committed prefix. - pub fn commit_image_token_counts(&mut self, counts: &[u32]) { - fn visit(node: &mut AstNode, counts: &[u32], idx: &mut usize) { - if *idx >= counts.len() { return; } - match node { - AstNode::Leaf(leaf) => { - if let NodeBody::Image { token_count, .. } = leaf.body() { - if *token_count == 0 { - leaf.set_image_token_count(counts[*idx]); - *idx += 1; - } - } - } - AstNode::Branch { children, .. 
} => { - for c in children { visit(c, counts, idx); } - } - } - } - let mut idx = 0usize; - for node in &mut self.system { visit(node, counts, &mut idx); } - for node in &mut self.identity { visit(node, counts, &mut idx); } - for node in &mut self.journal { visit(node, counts, &mut idx); } - for node in &mut self.conversation { visit(node, counts, &mut idx); } - } } impl Ast for ContextState { @@ -947,55 +1004,57 @@ impl Ast for ContextState { } /// An image collected from the AST for a request body. The AST stores -/// the pre-expanded token form (`<|vision_start|> + <|image_pad|>×N + -/// <|vision_end|>`), and the wire form mirrors that exactly so the -/// server's `session.tokens` length matches what vLLM's engine will -/// process. The authoritative N is obtained from the server via the -/// CountImageTokens RPC before the Image leaf is constructed. +/// Image metadata collected during `wire_chunks` — the binary + +/// mime plus the absolute token-position range of the image's +/// pre-expanded placeholder run in the full wire stream. Sent +/// alongside `append_tokens` in `GenerateRequest` so the server +/// can attach vision features to the declared positions. Positions +/// are absolute within the full wire walk starting at offset 0, +/// i.e. the same coordinate system as `session.tokens` on the +/// server once the walk has been applied. #[derive(Clone)] pub struct WireImage { pub bytes: Vec, pub mime: String, + pub pad_start: u32, + pub pad_end: u32, } -/// One piece of the wire stream for the gRPC session path. Runs of -/// text/tool/thinking tokens are batched into `Tokens`; each Image -/// leaf becomes its own `Image` chunk because the server writes the -/// full vision block on AppendImage — the client never sends vision -/// tokens inline. Order matches the AST's depth-first wire order. +/// One piece of the wire stream for the gRPC session path. 
Since +/// images now live inline in the token stream (pre-expanded at AST +/// construction time), there's only one variant — a run of tokens. +/// The parallel `Vec` returned by `wire_chunks` gives the +/// binary + position metadata for each embedded image. #[derive(Clone)] pub enum WireChunk { Tokens(Vec), - Image { - bytes: Vec, - mime: String, - /// Client's current best guess at how many tokens the server - /// will expand this image to, including bookends. `0` means - /// the count is unknown (view_image just loaded the image and - /// AppendImage hasn't run yet). Callers use this only to know - /// this chunk's contribution to the server-visible length for - /// offset bookkeeping on chunks that were already appended on - /// a prior turn. - known_expanded_len: u32, - }, } fn wire_into(node: &AstNode, tokens: &mut Vec, images: &mut Vec) { match node { AstNode::Leaf(leaf) => match leaf.body() { NodeBody::Image { bytes, mime, .. } => { - // Send the pre-expanded token form (includes N - // <|image_pad|> tokens); engine's multi_modal - // pipeline pairs them with the binary data below. + // The Image leaf's token_ids is already + // [VISION_START, IMAGE_PAD * N, VISION_END]. Inline + // those into the token stream and record the pad-run + // range so the server can attach features to the + // declared positions. + let pad_start = tokens.len() as u32; tokens.extend_from_slice(leaf.token_ids()); + let pad_end = tokens.len() as u32; images.push(WireImage { bytes: bytes.clone(), mime: mime.clone(), + pad_start, + pad_end, }); } _ => tokens.extend_from_slice(leaf.token_ids()), }, - AstNode::Branch { role, children, .. } => { + AstNode::Branch { token_ids: Some(cached), .. } => { + tokens.extend_from_slice(cached); + } + AstNode::Branch { role, children, token_ids: None, .. 
} => { tokens.push(tokenizer::IM_START); tokens.extend(tokenizer::encode(&format!("{}\n", role.as_str()))); for c in children { @@ -1118,10 +1177,16 @@ impl ContextState { } /// Build the wire stream as interleaved `WireChunk`s for the gRPC - /// session path. Unlike `wire_prompt`, this preserves the order - /// of text runs vs image blocks so the caller can drive the - /// append flow (AppendImage for each Image, Generate append for - /// contiguous text runs). + /// session path. Returns a tuple of (chunks, images): the chunks + /// hold the full token stream (with vision blocks inlined as + /// `VISION_START + IMAGE_PAD*N + VISION_END`), and the images + /// list carries each embedded image's binary + position range so + /// the gRPC layer can attach them via `GenerateRequest.images`. + /// + /// Note: with images inlined into the token stream, the chunks + /// list is structurally a single `Tokens` chunk in the common + /// case — the multi-chunk shape persists only because some + /// callers may want the option of inserting breakpoints later. /// /// `conv_range` and `skip` mirror `wire_prompt` — select a /// conversation slice and drop identity / conversation nodes by @@ -1130,46 +1195,43 @@ impl ContextState { &self, conv_range: std::ops::Range, mut skip: F, - ) -> Vec + ) -> (Vec, Vec) where F: FnMut(&AstNode) -> bool, { - let mut out: Vec = Vec::new(); let mut buf: Vec = Vec::new(); + let mut images: Vec = Vec::new(); - fn flush(buf: &mut Vec, out: &mut Vec) { - if !buf.is_empty() { - out.push(WireChunk::Tokens(std::mem::take(buf))); - } - } - - fn visit(node: &AstNode, buf: &mut Vec, out: &mut Vec) { + fn visit( + node: &AstNode, + buf: &mut Vec, + images: &mut Vec, + ) { match node { AstNode::Leaf(leaf) => match leaf.body() { - NodeBody::Image { bytes, mime, token_count, .. } => { - flush(buf, out); - // Bookends (VISION_START + VISION_END) add 2 - // to the expanded length; token_count is the - // IMAGE_PAD run. 
0 means count is still - // unknown (no AppendImage yet) — don't claim - // a length the server will disagree with. - let expanded = if *token_count == 0 { - 0 - } else { - *token_count + 2 - }; - out.push(WireChunk::Image { + NodeBody::Image { bytes, mime, .. } => { + // Pre-expanded vision block lives in + // leaf.token_ids: [VISION_START, IMAGE_PAD*N, + // VISION_END]. Inline + record the range. + let pad_start = buf.len() as u32; + buf.extend_from_slice(leaf.token_ids()); + let pad_end = buf.len() as u32; + images.push(WireImage { bytes: bytes.clone(), mime: mime.clone(), - known_expanded_len: expanded, + pad_start, + pad_end, }); } _ => buf.extend_from_slice(leaf.token_ids()), }, - AstNode::Branch { role, children, .. } => { + AstNode::Branch { token_ids: Some(cached), .. } => { + buf.extend_from_slice(cached); + } + AstNode::Branch { role, children, token_ids: None, .. } => { buf.push(tokenizer::IM_START); buf.extend(tokenizer::encode(&format!("{}\n", role.as_str()))); for c in children { - visit(c, buf, out); + visit(c, buf, images); } buf.push(tokenizer::IM_END); buf.extend(tokenizer::encode("\n")); @@ -1177,18 +1239,22 @@ impl ContextState { } } - for node in self.system() { visit(node, &mut buf, &mut out); } + for node in self.system() { visit(node, &mut buf, &mut images); } for node in self.identity() { if skip(node) { continue; } - visit(node, &mut buf, &mut out); + visit(node, &mut buf, &mut images); } - for node in self.journal() { visit(node, &mut buf, &mut out); } + for node in self.journal() { visit(node, &mut buf, &mut images); } for node in &self.conversation()[conv_range] { if skip(node) { continue; } - visit(node, &mut buf, &mut out); + visit(node, &mut buf, &mut images); } - flush(&mut buf, &mut out); - out + let chunks = if buf.is_empty() { + Vec::new() + } else { + vec![WireChunk::Tokens(buf)] + }; + (chunks, images) } } @@ -1209,17 +1275,27 @@ impl ContextState { dbglog!("warning: log: {:#}", e); } } + // Conversation appends always go to 
the tail — past committed — + // so they don't break the match. Any other section mutates a + // region the server may already have, so drop alignment. + if section != Section::Conversation { + self.mark_dirty(); + } self.section_mut(section).push(node); } /// Push without logging. pub fn push_no_log(&mut self, section: Section, node: AstNode) { + if section != Section::Conversation { + self.mark_dirty(); + } self.section_mut(section).push(node); } /// Replace the body of a leaf at `index` in `section`. /// Re-tokenizes to maintain the invariant. pub fn set_message(&mut self, section: Section, index: usize, body: NodeBody) { + self.mark_dirty(); let nodes = self.section_mut(section); let node = &mut nodes[index]; match node { @@ -1245,10 +1321,12 @@ impl ContextState { } pub fn del(&mut self, section: Section, index: usize) -> AstNode { + self.mark_dirty(); self.section_mut(section).remove(index) } pub fn clear(&mut self, section: Section) { + self.mark_dirty(); self.section_mut(section).clear(); } @@ -1269,6 +1347,7 @@ impl ContextState { /// are > 50% of conversation tokens) or oldest conversation entry. /// Phase 3: Snap to user message boundary at start. pub fn trim_conversation(&mut self) { + self.mark_dirty(); let max_tokens = context_budget_tokens(); let fixed = self.system.iter().map(|n| n.tokens()).sum::() + self.identity.iter().map(|n| n.tokens()).sum::() @@ -1345,11 +1424,49 @@ impl ContextState { } /// Push a child node into a branch at `index` in `section`. + /// Clears the branch's cached token stream — wire-out will recompute + /// from children until the cache is repopulated. If the cache was + /// populated (server had these bytes), drops session alignment. pub fn push_child(&mut self, section: Section, index: usize, child: AstNode) { + let node = &mut self.section_mut(section)[index]; + let was_cached = matches!(node, AstNode::Branch { token_ids: Some(_), .. }); + match node { + AstNode::Branch { children, token_ids, .. 
} => { + children.push(child); + *token_ids = None; + } + AstNode::Leaf(_) => panic!("push_child on leaf node"), + } + if was_cached { + self.mark_dirty(); + } + } + + /// Like `push_child` but preserves the branch's cached token stream. + /// Used by the response parser, which is simultaneously populating + /// the cache from the authoritative server stream and pushing the + /// parsed-out children — the two stay consistent by construction. + /// Module-private: callers outside `context.rs` must go through + /// `push_child` so the invariant is maintained. + fn push_child_raw(&mut self, section: Section, index: usize, child: AstNode) { let node = &mut self.section_mut(section)[index]; match node { AstNode::Branch { children, .. } => children.push(child), - AstNode::Leaf(_) => panic!("push_child on leaf node"), + AstNode::Leaf(_) => panic!("push_child_raw on leaf node"), + } + } + + /// Stamp a verbatim token cache onto the branch at `index` in + /// `section`. Used by the response parser to record the server's + /// authoritative token stream for the just-finished turn. + /// Module-private: the cache is an invariant-load-bearing piece + /// of state, populated only by code that holds the server's + /// ground truth. + fn set_branch_cache(&mut self, section: Section, index: usize, tokens: Vec) { + let node = &mut self.section_mut(section)[index]; + match node { + AstNode::Branch { token_ids, .. } => *token_ids = Some(tokens), + AstNode::Leaf(_) => panic!("set_branch_cache on leaf node"), } } @@ -1373,20 +1490,19 @@ impl ContextState { // to at request time. Constants come from Qwen3.5-27B's preprocessor_config. // --------------------------------------------------------------------------- -// Test-only client-side estimate of image token expansion. Production -// callers obtain the authoritative count from the server via -// CountImageTokens; these constants and helpers stay around only to -// keep the context-shape unit tests self-contained. 
-#[cfg(test)] +// Production client-side computation of image-token expansion. With +// the delta-session protocol, the client writes the pre-expanded +// vision block (VISION_START + N*IMAGE_PAD + VISION_END) directly +// into the token stream at Image-leaf construction time, and tells +// the server where each image's pad run lives via +// GenerateRequest.images. Server validates that this N matches +// what the vision encoder actually produces and rejects on +// mismatch — so drift here fails loudly, not silently. const QWEN3_PATCH_SIZE: u32 = 16; -#[cfg(test)] const QWEN3_MERGE_SIZE: u32 = 2; -#[cfg(test)] const QWEN3_MIN_PIXELS: u64 = 65_536; -#[cfg(test)] const QWEN3_MAX_PIXELS: u64 = 16_777_216; -#[cfg(test)] fn smart_resize(h: u32, w: u32, factor: u32, min_pixels: u64, max_pixels: u64) -> (u32, u32) { let max_s = h.max(w) as f64; let min_s = h.min(w) as f64; @@ -1415,11 +1531,10 @@ fn smart_resize(h: u32, w: u32, factor: u32, min_pixels: u64, max_pixels: u64) - } } -/// Test-only: client-side estimate of how many `<|image_pad|>` tokens -/// vLLM will emit for an image of the given dimensions. Production -/// callers use `salience::count_image_tokens` (server-authoritative). -#[cfg(test)] -fn qwen3_image_token_count(orig_h: u32, orig_w: u32) -> u32 { +/// How many `<|image_pad|>` tokens the Qwen3-VL vision encoder will +/// produce for an image of the given dimensions. Server verifies +/// this count against its own encoder run and rejects on mismatch. 
+pub fn qwen3_image_token_count(orig_h: u32, orig_w: u32) -> u32 { let factor = QWEN3_PATCH_SIZE * QWEN3_MERGE_SIZE; let (rh, rw) = smart_resize(orig_h, orig_w, factor, QWEN3_MIN_PIXELS, QWEN3_MAX_PIXELS); (rh / QWEN3_PATCH_SIZE) * (rw / QWEN3_PATCH_SIZE) / (QWEN3_MERGE_SIZE * QWEN3_MERGE_SIZE) @@ -1854,7 +1969,7 @@ mod tests { #[test] fn test_image_render_and_token_ids() { - let node = AstNode::image(vec![0u8, 1, 2, 3], "image/png", 512, 512, qwen3_image_token_count(512, 512)); + let node = AstNode::image(vec![0u8, 1, 2, 3], "image/png", 512, 512); let leaf = node.leaf().unwrap(); // 3 tokens of bookend + 256 image_pad tokens assert_eq!(leaf.token_ids().len(), 258); @@ -1874,7 +1989,7 @@ mod tests { let mut ctx = ContextState::new(); ctx.push_no_log(Section::Conversation, AstNode::branch(Role::User, vec![ AstNode::content("look:"), - AstNode::image(vec![0xDE, 0xAD], "image/png", 512, 512, qwen3_image_token_count(512, 512)), + AstNode::image(vec![0xDE, 0xAD], "image/png", 512, 512), ])); // AST side and wire side should both carry N image_pads + bookends — @@ -1904,7 +2019,7 @@ mod tests { #[test] fn test_image_serde_roundtrip() { - let node = AstNode::image(vec![0xDE, 0xAD, 0xBE, 0xEF], "image/png", 64, 64, qwen3_image_token_count(64, 64)); + let node = AstNode::image(vec![0xDE, 0xAD, 0xBE, 0xEF], "image/png", 64, 64); let json = serde_json::to_string(&node).unwrap(); // bytes must be base64-encoded in the JSON form assert!(json.contains("3q2+7w==")); diff --git a/src/agent/mod.rs b/src/agent/mod.rs index 613b106..1db40b1 100644 --- a/src/agent/mod.rs +++ b/src/agent/mod.rs @@ -333,14 +333,16 @@ impl Agent { /// becomes its own chunk. Also trims the conversation to budget /// first so we don't build a prompt the server will reject for /// length. 
- pub async fn assemble_prompt(&self) -> Vec { + pub async fn assemble_prompt(&self) + -> (Vec, Vec, u32) + { let mut ctx = self.context.lock().await; if ctx.total_tokens() > context::context_budget_tokens() { ctx.trim_conversation(); } let st = self.state.lock().await; let conv_len = ctx.conversation().len(); - let mut chunks = ctx.wire_chunks(0..conv_len, |_| false); + let (mut chunks, images) = ctx.wire_chunks(0..conv_len, |_| false); // Assistant-turn prologue. Merge into the trailing Tokens // chunk if there is one, else push as a new chunk. let mut prologue = vec![tokenizer::IM_START]; @@ -353,7 +355,8 @@ impl Agent { Some(context::WireChunk::Tokens(last)) => last.extend(prologue), _ => chunks.push(context::WireChunk::Tokens(prologue)), } - chunks + let match_upto = ctx.client_match_upto(); + (chunks, images, match_upto) } /// Rebuild the tools section of the system prompt from the current tools list. @@ -413,7 +416,7 @@ impl Agent { let _thinking = start_activity(&agent, "thinking...").await; let (rx, _stream_guard) = { - let chunks = agent.assemble_prompt().await; + let (chunks, images, match_upto) = agent.assemble_prompt().await; let st = agent.state.lock().await; let readout_shape = agent.readout.lock().ok().and_then(|buf| { buf.manifest.as_ref().map(|m| { @@ -423,6 +426,8 @@ impl Agent { agent.client.stream_session_mm( agent.grpc_session.clone(), chunks, + images, + match_upto, st.sampling, st.priority, readout_shape, diff --git a/src/agent/tools/vision.rs b/src/agent/tools/vision.rs index d122384..aede258 100644 --- a/src/agent/tools/vision.rs +++ b/src/agent/tools/vision.rs @@ -63,7 +63,7 @@ async fn view_image( // AppendImage (the server is authoritative for the IMAGE_PAD // count). Placeholder of 0 here until AppendImage is wired; the // leaf's count gets rewritten from the RPC response at send time. 
- let image_leaf = AstNode::image(bytes.clone(), mime, h, w, 0); + let image_leaf = AstNode::image(bytes.clone(), mime, h, w); let branch = AstNode::branch(Role::User, vec![image_leaf]); agent.context.lock().await.push_log(Section::Conversation, branch); diff --git a/src/mind/mod.rs b/src/mind/mod.rs index b2eb77a..9572272 100644 --- a/src/mind/mod.rs +++ b/src/mind/mod.rs @@ -693,7 +693,7 @@ impl Mind { } }); - let mut sub_handle: Option> = None; + let _sub_handle: Option> = None; // Start finetune scoring at startup (scores existing conversation) if !self.config.no_agents { @@ -743,6 +743,7 @@ impl Mind { _ = tokio::time::sleep(timeout), if !has_input => _dmn_expired = true, } + /* if !self.config.no_agents { if sub_handle.as_ref().map_or(true, |h| h.is_finished()) { let sub = self.subconscious.clone(); @@ -754,6 +755,7 @@ impl Mind { })); } } + */ // Check for pending user input → push to agent context and start turn let pending = self.shared.lock().unwrap().take_pending_input(); diff --git a/src/subconscious/generate.rs b/src/subconscious/generate.rs index 625b619..584d2c7 100644 --- a/src/subconscious/generate.rs +++ b/src/subconscious/generate.rs @@ -26,7 +26,7 @@ pub async fn gen_continuation( ) -> anyhow::Result where F: FnMut(&AstNode) -> bool, { - let mut chunks = context.wire_chunks(0..entry_idx, skip); + let (mut chunks, images) = context.wire_chunks(0..entry_idx, skip); // Assistant-turn prologue. let prologue = { @@ -50,19 +50,13 @@ where F: FnMut(&AstNode) -> bool, // `_guard` drops at function end. let session_lock = Arc::new(crate::Mutex::new(None)); let (mut rx, _guard) = client.stream_session_mm( - session_lock, chunks, sampling, Some(-5), None, + session_lock, chunks, images, 0, sampling, Some(-5), None, ); let mut tokens = Vec::new(); while let Some(tok) = rx.recv().await { match tok { StreamToken::Token { id, .. } => tokens.push(id), - StreamToken::ImageAppended { .. 
} => { - // subconscious/generate uses wire_chunks over an AST - // slice that shouldn't have unsized images — but if - // it ever does, we just don't care about updating the - // ephemeral session's AST view. - } StreamToken::Done { .. } => break, StreamToken::Error(e) => anyhow::bail!("generation error: {}", e), } diff --git a/src/subconscious/learn.rs b/src/subconscious/learn.rs index dca9b3c..feb209c 100644 --- a/src/subconscious/learn.rs +++ b/src/subconscious/learn.rs @@ -40,14 +40,15 @@ struct ScoreResult { total_logprob: f64, } -/// Convert a flat (prompt_tokens, images) pair into the interleaved -/// chunks the session protocol expects. Tokens up to the next -/// `<|vision_start|>` become a Tokens chunk; each -/// `<|vision_start|>..<|vision_end|>` run collapses into one Image -/// chunk paired by position with the next entry in `images`. The -/// server re-expands the IMAGE_PADs on AppendImage. -fn prompt_to_chunks(prompt: &[u32], images: &[WireImage]) -> Vec { - let mut out: Vec = Vec::new(); +/// Find each <|vision_start|>...<|vision_end|> run in the flat prompt +/// and pair it with the matching entry in `images`. Returns a list +/// of `ImageAttachment` with absolute pad-range positions, ready +/// to drop into `GenerateRequest.images`. 
+fn pair_images_to_ranges( + prompt: &[u32], + images: &[WireImage], +) -> Vec { + let mut out: Vec = Vec::new(); let mut cur = 0; let mut img_idx = 0; while cur < prompt.len() { @@ -60,22 +61,16 @@ fn prompt_to_chunks(prompt: &[u32], images: &[WireImage]) -> Vec { let img = images.get(img_idx) .unwrap_or_else(|| panic!( "image index {} out of range for {} images", img_idx, images.len())); - out.push(WireChunk::Image { + out.push(pb::ImageAttachment { bytes: img.bytes.clone(), mime: img.mime.clone(), - known_expanded_len: (end - cur) as u32, + pad_range_start: cur as u32, + pad_range_end: end as u32, }); img_idx += 1; cur = end; } else { - let next_vs = prompt[cur..].iter() - .position(|&t| t == tokenizer::VISION_START); - let end = match next_vs { - Some(o) => cur + o, - None => prompt.len(), - }; - out.push(WireChunk::Tokens(prompt[cur..end].to_vec())); - cur = end; + cur += 1; } } out @@ -95,36 +90,22 @@ async fn call_score( return Ok(Vec::new()); } - let chunks = prompt_to_chunks(prompt, images); + let images_pb = pair_images_to_ranges(prompt, images); let mut handle = SessionHandle::open(client).await?; - // Walk chunks: AppendImage for each image, prefill-only Generate - // for each text run between images. Accumulate any trailing text - // run into `pending` for the final logprob-generating Generate. - let mut pending: Vec = Vec::new(); - for chunk in chunks { - match chunk { - WireChunk::Tokens(t) => pending.extend(t), - WireChunk::Image { bytes, mime, .. } => { - if !pending.is_empty() { - handle.prefill_only(std::mem::take(&mut pending)).await?; - } - handle.append_image(bytes, mime, false).await?; - } - } - } - // Final Generate: max_tokens=0 so the server runs prefill of the - // trailing `pending` tokens and emits Token events for each - // position covered by logprobs_ranges, then Done. logprob_top_k=0 - // means "just the sampled (prompt) token's logprob" — no top-k - // alternatives, which is all call_score historically needed. 
+ // full prompt and emits Token events for each position covered + // by logprobs_ranges, then Done. logprob_top_k=0 means "just + // the sampled (prompt) token's logprob" — no top-k alternatives, + // which is all call_score historically needed. Images attach + // inline via `images`; the prompt already contains their pre- + // expanded vision blocks at the declared ranges. let logprobs_ranges: Vec = ranges.iter() .map(|(s, e)| pb::PositionRange { start: *s as u32, end: *e as u32 }) .collect(); let req = pb::GenerateRequest { session_id: handle.session_id.clone(), - append_tokens: pending, + append_tokens: prompt.to_vec(), offset: handle.committed_len, truncating: false, max_tokens: 0, @@ -136,6 +117,7 @@ async fn call_score( top_k: 0, stop_token_ids: Vec::new(), priority: priority.unwrap_or(0), + images: images_pb, }; let mut stream = handle.generate(req).await?; diff --git a/src/user/context.rs b/src/user/context.rs index 17660b5..8edd926 100644 --- a/src/user/context.rs +++ b/src/user/context.rs @@ -43,6 +43,7 @@ impl ConsciousScreen { name: format!("mem: {}", key), tokens: node.tokens(), content: text.clone(), + token_ids: leaf.token_ids().to_vec(), children: Vec::new(), status: score.map(|s| format!("{:.2}", s)).unwrap_or_default(), }); @@ -55,6 +56,7 @@ impl ConsciousScreen { name: format!("Memory nodes ({})", mem_children.len()), tokens: mem_tokens, content: String::new(), + token_ids: Vec::new(), children: mem_children, status: format!("{} scored, {} unscored", scored, unscored), }); @@ -70,11 +72,13 @@ impl ConsciousScreen { AstNode::Leaf(leaf) => leaf.body().text().to_string(), _ => String::new(), }, + token_ids: node.token_ids(), children: match node { AstNode::Branch { children, .. 
} => children.iter() .map(|c| SectionView { name: c.label(), tokens: c.tokens(), content: match c { AstNode::Leaf(l) => l.body().text().to_string(), _ => String::new() }, + token_ids: match c { AstNode::Leaf(l) => l.token_ids().to_vec(), _ => c.token_ids() }, children: Vec::new(), status: String::new(), }).collect(), _ => Vec::new(), @@ -101,6 +105,7 @@ impl ConsciousScreen { name: format!("Conversation ({} entries)", conv_children.len()), tokens: conv_tokens, content: String::new(), + token_ids: Vec::new(), children: conv_children, status: String::new(), }); diff --git a/src/user/subconscious.rs b/src/user/subconscious.rs index c332ce6..c71642d 100644 --- a/src/user/subconscious.rs +++ b/src/user/subconscious.rs @@ -207,6 +207,7 @@ impl SubconsciousScreen { name: key.clone(), tokens: 0, content: val.clone(), + token_ids: Vec::new(), children: Vec::new(), status: String::new(), } @@ -238,6 +239,7 @@ impl SubconsciousScreen { name: format!("Conversation ({} entries)", conv_children.len()), tokens: conv_children.iter().map(|c| c.tokens).sum(), content: String::new(), + token_ids: Vec::new(), children: conv_children, status: String::new(), }); diff --git a/src/user/widgets.rs b/src/user/widgets.rs index 49f3e3b..6706a69 100644 --- a/src/user/widgets.rs +++ b/src/user/widgets.rs @@ -8,11 +8,18 @@ use ratatui::{ }; use crate::agent::context::{AstNode, Ast, NodeBody}; -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct SectionView { pub name: String, pub tokens: usize, pub content: String, + /// Token-id stream for this subtree, displayed in place of + /// `content` when the tree's show-tokens mode is on. Populated + /// from `leaf.token_ids()` / `node.token_ids()` for views built + /// from the AST; empty for views that don't have a corresponding + /// AST node (subconscious entries, etc.), in which case the + /// token view falls back to the text content. + pub token_ids: Vec, pub children: Vec, /// Extra status text shown after the token count. 
pub status: String, @@ -32,6 +39,7 @@ fn node_to_view(node: &AstNode) -> SectionView { name, tokens: node.tokens(), content: leaf.body().text().to_string(), + token_ids: leaf.token_ids().to_vec(), children: Vec::new(), status, } @@ -44,6 +52,7 @@ fn node_to_view(node: &AstNode) -> SectionView { name: node.label(), tokens: node.tokens(), content: String::new(), + token_ids: node.token_ids(), children: child_views, status: String::new(), } @@ -54,10 +63,12 @@ fn node_to_view(node: &AstNode) -> SectionView { pub fn section_to_view(name: &str, nodes: &[AstNode]) -> SectionView { let children: Vec = nodes.iter().map(|n| node_to_view(n)).collect(); let total_tokens: usize = nodes.iter().map(|n| n.tokens()).sum(); + let token_ids: Vec = nodes.iter().flat_map(|n| n.token_ids()).collect(); SectionView { name: name.to_string(), tokens: total_tokens, content: String::new(), + token_ids, children, status: String::new(), } @@ -104,7 +115,7 @@ pub fn format_ts_age(ts: i64) -> String { /// Key legend for SectionTree panes. pub fn tree_legend() -> Line<'static> { Line::styled( - " ↑↓:nav →/Enter:expand ←:collapse e:expand all c:collapse all PgUp/Dn Home/End ", + " ↑↓:nav →/Enter:expand ←:collapse e:expand c:collapse v:toggle tokens/text PgUp/Dn ", Style::default().fg(Color::DarkGray), ) } @@ -185,11 +196,19 @@ pub struct SectionTree { pub selected: Option, pub expanded: std::collections::HashSet, pub scroll: super::scroll_pane::ScrollPaneState, + /// When true, render `token_ids` as space-separated IDs in place + /// of `content` in expanded panels. Toggled with 'v'. 
+ pub show_tokens: bool, } impl SectionTree { pub fn new() -> Self { - Self { selected: None, expanded: std::collections::HashSet::new(), scroll: super::scroll_pane::ScrollPaneState::new() } + Self { + selected: None, + expanded: std::collections::HashSet::new(), + scroll: super::scroll_pane::ScrollPaneState::new(), + show_tokens: false, + } } fn total_nodes(&self, sections: &[SectionView]) -> usize { @@ -264,6 +283,9 @@ impl SectionTree { KeyCode::Char('c') => { self.expanded.clear(); } + KeyCode::Char('v') => { + self.show_tokens = !self.show_tokens; + } _ => {} } self.scroll_to_selected(height); @@ -326,7 +348,12 @@ impl SectionTree { } } else if has_content { let content_indent = format!("{} │ ", " ".repeat(depth + 1)); - let content_lines: Vec<&str> = section.content.lines().collect(); + let body = if self.show_tokens && !section.token_ids.is_empty() { + format_token_ids_wrapped(&section.token_ids) + } else { + section.content.clone() + }; + let content_lines: Vec<&str> = body.lines().collect(); let show = content_lines.len().min(50); for line in &content_lines[..show] { lines.push(Line::styled( @@ -344,3 +371,16 @@ impl SectionTree { } } } + +/// Format token IDs for the content panel: space-separated, wrapped +/// at 12 ids per line so they fit comfortably in a pane. +fn format_token_ids_wrapped(ids: &[u32]) -> String { + let mut out = String::new(); + for (i, id) in ids.iter().enumerate() { + if i > 0 { + if i % 12 == 0 { out.push('\n'); } else { out.push(' '); } + } + out.push_str(&id.to_string()); + } + out +} From 11a7e4043e0b3e58f56566b17b22e7b15a241e49 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Apr 2026 22:15:31 -0400 Subject: [PATCH 06/31] scripts: FP8 quantize Qwen3.6-27B for vLLM (multimodal + MTP) Quantization recipe targeting the multimodal Qwen3.6-27B for vLLM serving. Three pitfalls the script avoids, each documented inline: 1. 
Loader strip: `AutoModelForCausalLM` silently drops the vision tower; we load via the config-declared `Qwen3_5ForConditionalGeneration` instead. 2. Pattern anchor: llmcompressor matches the `ignore` list against module names (no `.weight` suffix) when walking `named_modules()`, not against full tensor names. Patterns now anchor on `$` at the module name; the earlier `\.weight$` form silently quantized lm_head and every linear_attn projection. 3. vLLM fusion: vLLM fuses {q,k,v}_proj into qkv_proj, gate+up into gate_up_proj, and in_proj_qkv+in_proj_z into in_proj_qkvz. The compressed_tensors loader rejects mixed schemes within a fused layer, so the `ignore` list is shaped to keep all sub-components of a fused layer consistent. After `oneshot()` writes the FP8 output, MTP tensors (which the HF class doesn't expose) are spliced in at BF16 from the upstream cached snapshot, with the compressed_tensors metadata header preserved. Recipe follows Unsloth's UD-Q8_K_XL late-stack overrides (FFN: 50, 51, 59, 62, 63; ATTN: 51, 59, 63), extended to include `v_proj` for fusion compat. Final checkpoint is ~35 GB (matches Unsloth's GGUF size to within ~1%) with vision tower BF16, MTP head BF16, and most mlp/self_attn Linears at FP8_DYNAMIC. Co-Authored-By: Proof of Concept --- scripts/quantize_qwen3_6_mm.py | 327 +++++++++++++++++++++++++++++++++ 1 file changed, 327 insertions(+) create mode 100644 scripts/quantize_qwen3_6_mm.py diff --git a/scripts/quantize_qwen3_6_mm.py b/scripts/quantize_qwen3_6_mm.py new file mode 100644 index 0000000..844571e --- /dev/null +++ b/scripts/quantize_qwen3_6_mm.py @@ -0,0 +1,327 @@ +"""Quantize Qwen3.6-27B (multimodal) to FP8 for vLLM serving. + +Why this exists +--------------- +The earlier `quantize_qwen3_6.py` (in shell history, never committed) +loaded the model with `AutoModelForCausalLM`, which silently strips +the multimodal arch. Result: an FP8 checkpoint with no vision tower +weights at all. 
vLLM happily instantiated the vision tower from the +config and ran it with default/uninitialized weights, producing +gibberish image features and `!!!!!!`-style output. We chased that +through the protocol layer for a long time before tracing it back +to the quant. This script avoids that trap by loading via the +config-declared class explicitly. + +Recipe +------ +FP8_DYNAMIC (per-channel weight scales, per-token dynamic activation +scales, both E4M3) for Linear weights, with an `ignore` list derived +from Unsloth's UD-Q8_K_XL (`unsloth/Qwen3.6-27B-GGUF`). Their +sensitivity sweep flagged specific layers as quantization-fragile; +we honor those layer indices even though their algorithm is +GGUF-native Q8_K and ours is FP8 — sensitivity is a layer property, +not an algorithm property. + +vLLM fusion constraint +~~~~~~~~~~~~~~~~~~~~~~ +vLLM's Qwen3.5/3.6 model code fuses sub-modules at load time: + qkv_proj ← q_proj, k_proj, v_proj + gate_up_proj ← gate_proj, up_proj + in_proj_qkvz ← in_proj_qkv, in_proj_z + in_proj_ba ← in_proj_b, in_proj_a +compressed_tensors rejects checkpoints where sub-modules of a fused +layer have different quantization schemes. Our ignore list is shaped +around this — within any fused layer, all components share a scheme. +That's the reason `in_proj_qkv` is ignored even though Unsloth's +sweep doesn't single it out, and the reason late-stack attn override +covers q/k/v rather than just q/k. + +MTP merge +--------- +`Qwen3_5ForConditionalGeneration` doesn't expose the MTP submodule, +so `oneshot()` produces a checkpoint with the 15 `mtp.*` tensors +silently dropped. After quantization we read the MTP weights back +out of the upstream cached snapshot and splice them into the saved +safetensors at BF16. They're small (~850 MB) so quantizing them +isn't worth the calibration risk; speculative-decoding code paths +in vLLM expect the MTP head present. 
+ +Output +------ +`OUTPUT_DIR` gets the FP8 model.safetensors + config + processor + +recipe.yaml. Vision tower stays BF16 (in `ignore`); LM Linears go +to FP8; norms, SSM internals (not Linear), and MTP tensors stay +BF16 untouched. + +Verification at end: re-opens the saved safetensors and asserts +- vision .weight tensors present (>= 150; full count is 167) +- lm_head + embed_tokens at fp16/bf16 (NOT FP8) +- a sampled FP8'd Linear actually has float8 dtype +- 15 mtp.* tensors present + +Run +--- + ~/vllm-venv/bin/python quantize_qwen3_6_mm.py +""" +from __future__ import annotations + +import glob +import json +import sys +from pathlib import Path + +import torch +from huggingface_hub import snapshot_download +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier +from safetensors import safe_open +from safetensors.torch import save_file +from transformers import AutoProcessor +from transformers.models.qwen3_5.modeling_qwen3_5 import ( + Qwen3_5ForConditionalGeneration, +) + + +MODEL = "Qwen/Qwen3.6-27B" +OUTPUT_DIR = "/home/ubuntu/amygdala-training/Qwen3.6-27B-FP8-mm" + + +# Layers Unsloth's UD-Q8_K_XL keeps at F16 (perplexity-sensitive +# in their sweep). Late-stack clustering is consistent with the +# general finding that errors near the output propagate directly +# to logits. +LATE_FFN_LAYERS = (50, 51, 59, 62, 63) +LATE_ATTN_LAYERS = (51, 59, 63) + + +# Build the ignore regex list. Note: llmcompressor matches these +# patterns against MODULE names (no `.weight` suffix) when walking +# `named_modules()` for `targets=["Linear"]`. The first pass of +# this script used `\.weight$` patterns and silently quantized +# lm_head + every linear_attn projection — verified post-hoc by +# inspecting the saved safetensors. Patterns now anchor on `$` +# at the module name. +IGNORE_PATTERNS: list[str] = [ + # Original recipe: lm_head and embeddings always full-precision. 
+ # (embed_tokens is an Embedding, not a Linear, so it's already + # ignored by `targets=["Linear"]`. Pattern kept as belt-and- + # suspenders in case future llmcompressor versions widen the + # target set.) + "re:lm_head$", + "re:.*embed_tokens$", + + # Vision tower — entire `model.visual.*` subtree (vision + # transformer blocks + merger + patch_embed + pos_embed). + # Unsloth ships the vision tower as a separate `mmproj-BF16.gguf` + # for GGUF consumers; in our single-file FP8 setup we just leave + # them at BF16. + "re:model\\.visual\\..*", + + # MTP (multi-token prediction) module — Unsloth's GGUF doesn't + # carry MTP weights so we have no precision signal from them; + # safest to keep BF16. + "re:mtp\\..*", + + # Linear-attention block — keep ENTIRELY at BF16. vLLM fuses + # `in_proj_qkv` and `in_proj_z` into a single `in_proj_qkvz` + # layer, and compressed_tensors rejects mixed schemes within a + # fused layer. Unsloth's recipe keeps z, a, b, out at F16/F32 + # (gate/SSM internals are quantization-fragile in the GatedDeltaNet + # update), so the principled choice is to also keep `in_proj_qkv` + # at BF16 rather than FP8'ing the gate to match. We give up ~1 GB + # of FP8 coverage; in exchange we follow Unsloth's quality intent + # and load cleanly under vLLM. (`in_proj_a` + `in_proj_b` are + # likewise fused as `in_proj_ba` — both ignored, consistent.) + "re:model\\.language_model\\.layers\\.\\d+\\.linear_attn\\.in_proj_qkv$", + "re:model\\.language_model\\.layers\\.\\d+\\.linear_attn\\.in_proj_z$", + "re:model\\.language_model\\.layers\\.\\d+\\.linear_attn\\.in_proj_a$", + "re:model\\.language_model\\.layers\\.\\d+\\.linear_attn\\.in_proj_b$", + "re:model\\.language_model\\.layers\\.\\d+\\.linear_attn\\.out_proj$", + + # Per-layer high-precision MLP (Unsloth flagged exactly these + # late-stack indices in their UD-Q8_K_XL sensitivity sweep, all + # three of {gate, up, down} per layer). 
vLLM fuses gate+up into + # `gate_up_proj`; ignoring both keeps the fused layer consistent. + # `down_proj` is its own (non-fused) layer. + "re:model\\.language_model\\.layers\\.(" + + "|".join(str(n) for n in LATE_FFN_LAYERS) + + ")\\.mlp\\.(down|gate|up)_proj$", + + # Per-layer high-precision attention q/k/v (Unsloth's sweep upgrades + # only q and k; we extend to v because vLLM fuses q/k/v into + # `qkv_proj` and rejects mixed schemes). `o_proj` is its own + # non-fused layer and stays at FP8. + "re:model\\.language_model\\.layers\\.(" + + "|".join(str(n) for n in LATE_ATTN_LAYERS) + + ")\\.self_attn\\.(q|k|v)_proj$", +] + + +def main() -> None: + print(f"Loading {MODEL} as multimodal " + f"(Qwen3_5ForConditionalGeneration)...", flush=True) + model = Qwen3_5ForConditionalGeneration.from_pretrained( + MODEL, + dtype=torch.bfloat16, + device_map="auto", + trust_remote_code=True, + ) + print(f" loaded: {model.__class__.__name__}", flush=True) + + print(f"Loading processor (text + image preprocessing)...", flush=True) + processor = AutoProcessor.from_pretrained(MODEL, trust_remote_code=True) + + print("Running FP8_DYNAMIC oneshot quantization...", flush=True) + print(f" ignore list: {len(IGNORE_PATTERNS)} patterns", + flush=True) + recipe = QuantizationModifier( + targets=["Linear"], + scheme="FP8_DYNAMIC", + ignore=IGNORE_PATTERNS, + ) + oneshot(model=model, recipe=recipe, output_dir=OUTPUT_DIR) + processor.save_pretrained(OUTPUT_DIR) + print(f" wrote model + processor to {OUTPUT_DIR}", flush=True) + + merge_mtp(OUTPUT_DIR) + verify_output(OUTPUT_DIR) + + +def merge_mtp(out_dir: str) -> None: + """Splice upstream MTP tensors into the saved FP8 safetensors. + + `Qwen3_5ForConditionalGeneration` skips the MTP submodule on load, + so oneshot's output is missing the 15 `mtp.*` tensors. We resolve + the upstream snapshot via the HF cache (already populated by + from_pretrained), pull just the MTP tensors out at BF16, and + rewrite the safetensors with them merged in. 
The compressed_tensors + metadata header (which carries the FP8 format identifier vLLM + needs to dequantize) is preserved verbatim. + + Atomic-rename is used so a crash mid-write doesn't corrupt the + 33+ GB checkpoint we just spent minutes producing. + """ + print("\nMerging upstream MTP tensors...", flush=True) + upstream_dir = Path(snapshot_download( + MODEL, + allow_patterns=["model.safetensors.index.json", + "model-*-of-*.safetensors"], + )) + + with open(upstream_dir / "model.safetensors.index.json") as f: + idx = json.load(f) + mtp_shards = sorted({v for k, v in idx["weight_map"].items() + if k.startswith("mtp.")}) + print(f" MTP tensors live in shards: {mtp_shards}", flush=True) + + mtp_tensors: dict[str, torch.Tensor] = {} + for shard in mtp_shards: + with safe_open(upstream_dir / shard, framework="pt") as f: + for k in f.keys(): + if k.startswith("mtp."): + mtp_tensors[k] = f.get_tensor(k).contiguous() + mtp_bytes = sum(t.numel() * t.element_size() + for t in mtp_tensors.values()) + print(f" loaded {len(mtp_tensors)} mtp tensors " + f"({mtp_bytes/1e6:.1f} MB)", flush=True) + + fp8_files = sorted(Path(out_dir).glob("*.safetensors")) + if len(fp8_files) != 1: + sys.exit(f"FAIL: expected single safetensors shard, " + f"got {fp8_files}") + existing_path = fp8_files[0] + + with safe_open(existing_path, framework="pt") as f: + metadata = f.metadata() or {} + all_tensors = {k: f.get_tensor(k) for k in f.keys()} + + overlap = set(all_tensors) & set(mtp_tensors) + if overlap: + sys.exit(f"FAIL: MTP key collision with FP8 output: " + f"{sorted(overlap)[:5]}") + all_tensors.update(mtp_tensors) + + tmp_path = existing_path.with_name(existing_path.name + ".new") + print(f" rewriting {existing_path.name} " + f"({len(all_tensors)} tensors)...", flush=True) + save_file(all_tensors, str(tmp_path), metadata=metadata) + tmp_path.replace(existing_path) + print(" done", flush=True) + + +def verify_output(out_dir: str) -> None: + """Open the saved safetensors and assert the 
recipe actually + landed: vision tower present at BF16, FP8 dtype on at least one + quantized Linear, lm_head not FP8.""" + print(f"\nVerifying {out_dir}...", flush=True) + + files = sorted(glob.glob(f"{out_dir}/*.safetensors")) + if not files: + sys.exit(f"FAIL: no safetensors in {out_dir}") + + vision_keys: list[tuple[str, str]] = [] + fp8_sample: tuple[str, str] | None = None + lm_head_dtype: str | None = None + mtp_keys: list[str] = [] + + for fp in files: + with safe_open(fp, framework="pt") as f: + for k in f.keys(): + if k.startswith("mtp."): + mtp_keys.append(k) + # Some FP8 quants write a sibling `_scale` / `_zero_point`; + # we just care about the .weight tensors. + if not k.endswith(".weight"): + continue + t = f.get_tensor(k) + dtype = str(t.dtype).replace("torch.", "") + if "model.visual." in k: + vision_keys.append((k, dtype)) + if k == "lm_head.weight": + lm_head_dtype = dtype + if (fp8_sample is None + and "float8" in dtype + and "language_model.layers" in k): + fp8_sample = (k, dtype) + + # Qwen3.6-27B has 167 vision `.weight` tensors (333 vision tensors + # total, the rest are `.bias` and per-block norms). 150 is a + # sanity floor that catches "vision tower didn't make it through" + # without being brittle to minor arch revisions. + if len(vision_keys) < 150: + sys.exit(f"FAIL: only {len(vision_keys)} vision tensors found " + f"(expected >= 150). 
Vision tower didn't make it " + f"through the quant.") + + bad_vision = [(k, d) for k, d in vision_keys if "float8" in d] + if bad_vision: + sys.exit(f"FAIL: vision weights got quantized to FP8: " + f"{bad_vision[:3]}...") + + if lm_head_dtype is None: + sys.exit("FAIL: lm_head.weight not found in output.") + if "float8" in lm_head_dtype: + sys.exit(f"FAIL: lm_head.weight is FP8 ({lm_head_dtype}); " + f"should be BF16/FP16.") + + if fp8_sample is None: + sys.exit("FAIL: no FP8 weights found in language_model.layers — " + "the recipe didn't quantize anything.") + + # Upstream Qwen3.6-27B has exactly 15 mtp.* tensors (1 fused + # transformer block + projection + norms). merge_mtp() should + # have spliced all of them in. + if len(mtp_keys) != 15: + sys.exit(f"FAIL: expected 15 mtp.* tensors, found " + f"{len(mtp_keys)}. merge_mtp() missed some.") + + print(f" ✓ {len(vision_keys)} vision tensors at " + f"{vision_keys[0][1]} (not FP8)") + print(f" ✓ lm_head.weight at {lm_head_dtype} (not FP8)") + print(f" ✓ FP8 sample: {fp8_sample[0]} = {fp8_sample[1]}") + print(f" ✓ {len(mtp_keys)} mtp.* tensors present") + print("DONE") + + +if __name__ == "__main__": + main() From 10c8878f1c2bd7d15d28126c1445e774fe63356b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Apr 2026 22:36:10 -0400 Subject: [PATCH 07/31] agent: bump tonic gRPC message caps to 64 MiB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default 4 MiB cap on encoded/decoded messages is too small for the multimodal Generate path: Qwen3.6-VL high-res patches put 5–8 MiB of pre-encoded image bytes inline in a single Generate request, and Done events carrying full per-token readout vectors can also exceed 4 MiB on long runs. Hit "ResourceExhausted: Received message larger than max (5799108 vs. 4194304)" from the salience server. Bump both encode and decode caps on every cloned SalienceClient. 
The matching server-side bump is in vllm/entrypoints/salience/server.py. Co-Authored-By: Proof of Concept --- src/agent/api/mod.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/agent/api/mod.rs b/src/agent/api/mod.rs index 5705d89..fc8a358 100644 --- a/src/agent/api/mod.rs +++ b/src/agent/api/mod.rs @@ -117,6 +117,12 @@ impl ApiClient { /// the channel on first call and reuses it thereafter across /// every ApiClient clone. All scoring / inference / session /// RPCs flow through this single multiplexed HTTP/2 connection. + /// + /// Bumps tonic's default 4 MiB encode/decode caps to 64 MiB on + /// every client. Multimodal Generate requests carry pre-encoded + /// image bytes inline (Qwen3.6's 768×768 patches at high res + /// land around 5–8 MiB per turn), and Done events with full + /// per-token readout vectors can also exceed 4 MiB on long runs. pub async fn salience_client(&self) -> Result< salience::pb::salience_client::SalienceClient > { @@ -127,7 +133,10 @@ impl ApiClient { self.base_url, grpc_url); salience::connect_channel(&grpc_url).await }).await?; - Ok(salience::pb::salience_client::SalienceClient::new(ch.clone())) + const MAX_GRPC_MESSAGE_BYTES: usize = 64 * 1024 * 1024; + Ok(salience::pb::salience_client::SalienceClient::new(ch.clone()) + .max_decoding_message_size(MAX_GRPC_MESSAGE_BYTES) + .max_encoding_message_size(MAX_GRPC_MESSAGE_BYTES)) } /// Stream generation via a gRPC session. Walks the prompt chunks From 006b99bdac13cec71131f627c3d6dd0722d3284a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Apr 2026 22:44:19 -0400 Subject: [PATCH 08/31] bin: enable panic backtraces by default stderr is redirected to ~/.consciousness/logs/tui-stderr.log via redirect_stderr_to_pipe(), but the default panic hook checks RUST_BACKTRACE before printing the trace; without the env var the log only catches the "note: run with \`RUST_BACKTRACE=full\`" tail and the actual frames are dropped. 
Set RUST_BACKTRACE=1 programmatically before any other thread spawns so the log captures the trace by default. Existing user-set value is respected so callers can still opt into "full" if they want. Co-Authored-By: Proof of Concept --- src/bin/consciousness.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/bin/consciousness.rs b/src/bin/consciousness.rs index 2fcfebf..61d28e1 100644 --- a/src/bin/consciousness.rs +++ b/src/bin/consciousness.rs @@ -2,6 +2,17 @@ #![warn(unreachable_pub)] fn main() { + // Force the default panic hook to print a backtrace. stderr is + // already redirected to a daemon log; without this the hook obeys + // RUST_BACKTRACE (unset by default), so the log only shows the + // "note: run with `RUST_BACKTRACE=full`" tail and the actual + // frames are lost. + // + // SAFETY: called before any other thread is spawned, so no + // concurrent env reader can race. + if std::env::var_os("RUST_BACKTRACE").is_none() { + unsafe { std::env::set_var("RUST_BACKTRACE", "1"); } + } std::panic::set_backtrace_style(std::panic::BacktraceStyle::Short); consciousness::user::main() } From c2433c17739d0a5cdc7753eec0fe6d429e32a7db Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Apr 2026 23:15:55 -0400 Subject: [PATCH 09/31] context: tighten the Branch token-cache invariant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two pieces around the cache that landed when Branch nodes started holding `token_ids: Some(server_authoritative_stream)`: 1. wire_into / wire_chunks now pair cached vision blocks with their child Image leaves. Previously the cached-branch arm spliced the cache verbatim and didn't recurse for images, so a Branch whose cache contained `VISION_START..VISION_END` blocks would emit those tokens with no matching `WireImage` push — leading to a panic downstream when `pair_images_to_ranges` tried to attach the missing image. 
New `pair_cached_images` walks the children depth-first for image leaves and zips them against `vision_blocks(cache)` to emit correctly-offset entries; mismatched counts panic loudly because that's an AST/cache invariant violation that would otherwise mis-pair on the wire. 2. `conversation_mut() -> &mut Vec` was the one public escape hatch that let callers reach into a Branch's children and mutate them without invalidating the cached token stream. Removed in favor of a focused `set_branch_memory_score(section, index, key, score)` for the only legitimate use we had today (the full-matrix scorer writing per-memory divergence onto the Assistant Branch). Updated the lone caller in subconscious/learn. Documented the invariants explicitly on `ContextState`: every `Leaf.token_ids` matches `body.compute_token_ids()`, and every `Branch { token_ids: Some(_) }` is a faithful walk of its children. Co-Authored-By: Proof of Concept --- src/agent/context.rs | 153 ++++++++++++++++++++++++++++++++++++-- src/subconscious/learn.rs | 26 +++---- 2 files changed, 160 insertions(+), 19 deletions(-) diff --git a/src/agent/context.rs b/src/agent/context.rs index 0a49e05..d61136f 100644 --- a/src/agent/context.rs +++ b/src/agent/context.rs @@ -154,8 +154,19 @@ pub enum AstNode { } /// The context window: four sections as Vec. -/// All mutation goes through ContextState methods to maintain the invariant -/// that token_ids on every leaf matches its rendered text. +/// +/// All mutation MUST go through `ContextState`'s public methods. Two +/// invariants ride on this: +/// 1. Every `Leaf.token_ids` matches its `body.compute_token_ids()`. +/// 2. For every `Branch { token_ids: Some(cached), .. }`, the cached +/// token stream matches what `wire_into` would produce by walking +/// `children` from scratch. Any mutation that touches a Branch's +/// children — directly or via a descendant — must clear the +/// Branch's `token_ids` so it gets recomputed on next wire-out. 
+/// +/// The `&mut Vec` escape hatches are intentionally NOT +/// exposed; if you find yourself wanting one, add a focused method +/// here that maintains the invariants. pub struct ContextState { system: Vec, identity: Vec, @@ -966,7 +977,33 @@ impl ContextState { pub fn identity(&self) -> &[AstNode] { &self.identity } pub fn journal(&self) -> &[AstNode] { &self.journal } pub fn conversation(&self) -> &[AstNode] { &self.conversation } - pub fn conversation_mut(&mut self) -> &mut Vec { &mut self.conversation } + + /// Set or clear a single `memory_scores` entry on an Assistant + /// Branch. Used by the full-matrix scorer to attribute per-memory + /// divergence onto the response. `score = None` removes the key; + /// `Some(s)` inserts/overwrites. + /// + /// Doesn't affect the Branch's token cache: `memory_scores` is a + /// serialized-but-non-tokenizing annotation. No-op (with a debug + /// log) if the index points to a Leaf or a non-Assistant Branch — + /// callers are typically iterating on stale indices and we'd + /// rather skip than panic. + pub fn set_branch_memory_score( + &mut self, + section: Section, + index: usize, + key: &str, + score: Option, + ) { + let nodes = self.section_mut(section); + let Some(node) = nodes.get_mut(index) else { return }; + let AstNode::Branch { role: Role::Assistant, memory_scores, .. } = node + else { return }; + match score { + Some(s) => { memory_scores.insert(key.to_string(), s); } + None => { memory_scores.remove(key); } + } + } pub fn sections(&self) -> [&Vec; 4] { [&self.system, &self.identity, &self.journal, &self.conversation] @@ -1051,8 +1088,14 @@ fn wire_into(node: &AstNode, tokens: &mut Vec, images: &mut Vec) } _ => tokens.extend_from_slice(leaf.token_ids()), }, - AstNode::Branch { token_ids: Some(cached), .. } => { + AstNode::Branch { token_ids: Some(cached), children, .. 
} => { + // Cached branches still need their image children paired + // up with the vision-block ranges embedded in the cached + // token stream — the cache captures vision tokens but not + // the matching bytes/mime. + let base = tokens.len() as u32; tokens.extend_from_slice(cached); + pair_cached_images(cached, children, base, images); } AstNode::Branch { role, children, token_ids: None, .. } => { tokens.push(tokenizer::IM_START); @@ -1066,6 +1109,101 @@ fn wire_into(node: &AstNode, tokens: &mut Vec, images: &mut Vec) } } +/// Depth-first iterator over Image leaves under a slice of AST nodes. +/// Yields `(bytes, mime)` borrows in document order; doesn't allocate +/// per yield (only a stack of pending nodes). +struct ImageLeaves<'a> { + stack: Vec<&'a AstNode>, +} + +impl<'a> ImageLeaves<'a> { + fn new(nodes: &'a [AstNode]) -> Self { + let mut stack = Vec::with_capacity(nodes.len()); + stack.extend(nodes.iter().rev()); + Self { stack } + } +} + +impl<'a> Iterator for ImageLeaves<'a> { + type Item = (&'a [u8], &'a str); + fn next(&mut self) -> Option { + while let Some(node) = self.stack.pop() { + match node { + AstNode::Leaf(leaf) => { + if let NodeBody::Image { bytes, mime, .. } = leaf.body() { + return Some((bytes, mime)); + } + } + AstNode::Branch { children, .. } => { + self.stack.extend(children.iter().rev()); + } + } + } + None + } +} + +/// Iterator over `(start, end)` token-offset pairs for each +/// `VISION_START..VISION_END` block in a token slice. Panics on an +/// unmatched VISION_START — that's an upstream tokenization bug +/// worth a loud failure. 
+fn vision_blocks(cached: &[u32]) -> impl Iterator + '_ { + let mut cur = 0; + std::iter::from_fn(move || { + while cur < cached.len() { + if cached[cur] == tokenizer::VISION_START { + let start = cur; + let end_rel = cached[cur..].iter() + .position(|&t| t == tokenizer::VISION_END) + .unwrap_or_else(|| panic!( + "unmatched VISION_START at offset {} in cached branch", + start)); + let end = cur + end_rel + 1; + cur = end; + return Some((start, end)); + } + cur += 1; + } + None + }) +} + +/// For a Branch whose `token_ids` are cached and may contain inlined +/// vision blocks (`VISION_START + IMAGE_PAD*N + VISION_END`), recover +/// the matching image bytes/mime from the children and emit one +/// `WireImage` per vision block with the absolute pad offsets in the +/// parent token stream. +/// +/// The cache stores tokens but not image payloads; the AST stores +/// image payloads in the children but not their post-cache positions. +/// Pair them by zipping the two iterators; mismatched counts panic +/// loudly because that's an AST/cache invariant violation that +/// would otherwise mis-pair images on the wire. 
+fn pair_cached_images( + cached: &[u32], + children: &[AstNode], + base_offset: u32, + images: &mut Vec, +) { + let mut blocks = vision_blocks(cached); + let mut leaves = ImageLeaves::new(children); + loop { + match (blocks.next(), leaves.next()) { + (Some((s, e)), Some((bytes, mime))) => images.push(WireImage { + bytes: bytes.to_vec(), + mime: mime.to_string(), + pad_start: base_offset + s as u32, + pad_end: base_offset + e as u32, + }), + (None, None) => break, + (Some(_), None) => panic!( + "cached branch has more vision blocks than image children"), + (None, Some(_)) => panic!( + "cached branch has fewer vision blocks than image children"), + } + } +} + pub fn memory_key(node: &AstNode) -> Option<&str> { match node { AstNode::Leaf(leaf) => match leaf.body() { @@ -1224,8 +1362,13 @@ impl ContextState { } _ => buf.extend_from_slice(leaf.token_ids()), }, - AstNode::Branch { token_ids: Some(cached), .. } => { + AstNode::Branch { token_ids: Some(cached), children, .. } => { + // Same fix as wire_into's cached arm: the cache + // holds vision tokens but not the matching bytes, + // so walk children to recover them. + let base = buf.len() as u32; buf.extend_from_slice(cached); + pair_cached_images(cached, children, base, images); } AstNode::Branch { role, children, token_ids: None, .. } => { buf.push(tokenizer::IM_START); diff --git a/src/subconscious/learn.rs b/src/subconscious/learn.rs index feb209c..129e26b 100644 --- a/src/subconscious/learn.rs +++ b/src/subconscious/learn.rs @@ -240,25 +240,23 @@ pub async fn score_memories( vec![0.0; baseline.len()] } }; - // Write this memory's scores to the live AST nodes + // Write this memory's scores to the live AST nodes via the + // focused setter — keeps the AST mutation surface narrow. 
{ let mut ctx = agent.context.lock().await; let mut set_count = 0; for (resp_idx, &idx) in response_indices.iter().enumerate() { - if idx >= ctx.conversation().len() { continue; } - let node = &mut ctx.conversation_mut()[idx]; - if let AstNode::Branch { - role: Role::Assistant, memory_scores, .. - } = node { - if let Some(&score) = row.get(resp_idx) { - if score > 0.01 { - memory_scores.insert(key.clone(), score); - set_count += 1; - } else { - memory_scores.remove(key.as_str()); - } - } + let Some(&score) = row.get(resp_idx) else { continue }; + let normalized = if score > 0.01 { Some(score) } else { None }; + ctx.set_branch_memory_score( + crate::agent::context::Section::Conversation, + idx, + &key, + normalized, + ); + if normalized.is_some() { + set_count += 1; } } From 371b40078dcff9c668791f7c0933383f25c110c5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Apr 2026 23:32:44 -0400 Subject: [PATCH 10/31] context: salvage in-flight tag accumulators on premature stream end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ResponseParser.finish() was only flushing self.buf — the rolling tail window — and silently dropping self.think_buf and self.tool_call_buf. When a stream ended inside an unterminated ... or ... block (max_tokens reached, EOS before the close tag, server-side cancel), all the accumulated in-tag content was discarded and only the trailing ~8 bytes survived (drain_safe keeps `close_tag.len()` bytes at the tail of buf to handle across-chunk tag splits — and `` is exactly 8 chars). Symptom: assistant responses cut off, only the last few characters come through. Especially severe in native-think mode where in_think is set from prefill, so the entire response accumulates in think_buf and gets wiped on premature stop. In finish(): if in_think, drain buf into think_buf and emit as a Thinking node (preserving the partial thought). 
If in_tool_call, attempt to parse the body; on parse failure, wrap the partial as content with the leading open tag so the model sees its own truncated attempt next turn rather than losing it. Co-Authored-By: Proof of Concept --- src/agent/context.rs | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/src/agent/context.rs b/src/agent/context.rs index d61136f..a42beeb 100644 --- a/src/agent/context.rs +++ b/src/agent/context.rs @@ -900,7 +900,43 @@ impl ResponseParser { } pub fn finish(mut self, ctx: &mut ContextState) { - if !self.buf.is_empty() { + // Salvage any in-flight tag accumulators if the stream ended + // before the close tag arrived (max_tokens, premature EOS, + // server-side cancel). Without this, an unterminated + // <think>... drops all of self.think_buf and only the + // trailing rolling window in self.buf survives — observed as + // "responses cut off, only the last ~8 characters come + // through" because drain_safe keeps `close_tag.len()` bytes + // (8 for `</think>`) at the tail of buf. + if self.in_think { + if !self.buf.is_empty() { + self.think_buf.push_str(&std::mem::take(&mut self.buf)); + } + let text = std::mem::take(&mut self.think_buf).trim().to_string(); + if !text.is_empty() { + self.push_child(ctx, AstNode::thinking(text)); + } + self.in_think = false; + } else if self.in_tool_call { + if !self.buf.is_empty() { + self.tool_call_buf.push_str(&std::mem::take(&mut self.buf)); + } + let body = std::mem::take(&mut self.tool_call_buf); + match parse_tool_call_body(&body) { + Some((name, args)) => { + self.flush_content(ctx); + self.push_child(ctx, AstNode::tool_call(&name, &args)); + } + None => { + // Body's likely incomplete (no `</tool_call>` ever + // arrived). Wrap as content with the open tag so the + // model can see its own truncated attempt next turn + // rather than losing it silently. 
+ self.content_parts.push(format!("\n{}", body)); + } + } + self.in_tool_call = false; + } else if !self.buf.is_empty() { self.content_parts.push(std::mem::take(&mut self.buf)); } self.flush_content(ctx); From 5210f7dd66217e579f3cf6b3643a810dee43ae27 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Apr 2026 15:15:21 -0400 Subject: [PATCH 11/31] context: heal pre-refactor image logs with token_count=0 Recompute image token counts from persisted dimensions when loading old logs that stored count=0 (server-authoritative count was applied after AppendImage before client-side pad expansion). graph: cache neighbor sets for clustering coefficient Pre-compute neighbor HashSets so the O(deg^2) triangle-counting inner loop doesn't re-allocate on every (i,j) pair. avg_clustering_ coefficient() now builds the cache once instead of O(N*deg) times. --- src/agent/context.rs | 14 ++++++++- src/hippocampus/graph.rs | 66 ++++++++++++++++++++++++++++------------ 2 files changed, 60 insertions(+), 20 deletions(-) diff --git a/src/agent/context.rs b/src/agent/context.rs index a42beeb..a10afb8 100644 --- a/src/agent/context.rs +++ b/src/agent/context.rs @@ -125,7 +125,19 @@ impl<'de> Deserialize<'de> for NodeLeaf { body: NodeBody, timestamp: DateTime, } - let raw = Raw::deserialize(deserializer)?; + let mut raw = Raw::deserialize(deserializer)?; + // Heal pre-refactor logs: Image leaves used to be deserialized + // with token_count=0 (server-authoritative count was applied + // after AppendImage). With pads now expanded client-side at + // construction, recompute from the persisted dimensions if + // the stored count is 0. + if let NodeBody::Image { orig_height, orig_width, token_count, .. 
} + = &mut raw.body + { + if *token_count == 0 { + *token_count = qwen3_image_token_count(*orig_height, *orig_width); + } + } let token_ids = raw.body.compute_token_ids(); Ok(NodeLeaf { body: raw.body, token_ids, timestamp: raw.timestamp }) } diff --git a/src/hippocampus/graph.rs b/src/hippocampus/graph.rs index 0e7a20d..6c07fe5 100644 --- a/src/hippocampus/graph.rs +++ b/src/hippocampus/graph.rs @@ -40,6 +40,31 @@ pub struct Graph { communities: HashMap, } +/// Compute clustering coefficient for a node whose neighbor-set is `nbrs`, +/// using `cache` to look up each neighbor's neighbor-set in O(1) without +/// re-allocating on every (i, j) pair of the inner loop. +fn cc_cached<'a>( + nbrs: &HashSet<&'a str>, + cache: &HashMap<&'a str, HashSet<&'a str>>, +) -> f32 { + let deg = nbrs.len(); + if deg < 2 { + return 0.0; + } + let neighbor_vec: Vec<&str> = nbrs.iter().copied().collect(); + let mut triangles = 0u32; + for i in 0..neighbor_vec.len() { + for j in (i + 1)..neighbor_vec.len() { + if let Some(ni) = cache.get(neighbor_vec[i]) { + if ni.contains(neighbor_vec[j]) { + triangles += 1; + } + } + } + } + (2.0 * triangles as f32) / (deg as f32 * (deg as f32 - 1.0)) +} + impl Graph { pub fn nodes(&self) -> &HashSet { &self.keys @@ -207,34 +232,37 @@ impl Graph { /// cc(v) = 2E / (deg * (deg - 1)) pub fn clustering_coefficient(&self, key: &str) -> f32 { let neighbors = self.neighbor_keys(key); - let deg = neighbors.len(); - if deg < 2 { + if neighbors.len() < 2 { return 0.0; } - - let neighbor_vec: Vec<&str> = neighbors.iter().copied().collect(); - let mut triangles = 0u32; - for i in 0..neighbor_vec.len() { - for j in (i + 1)..neighbor_vec.len() { - let ni_neighbors = self.neighbor_keys(neighbor_vec[i]); - if ni_neighbors.contains(neighbor_vec[j]) { - triangles += 1; - } - } - } - - (2.0 * triangles as f32) / (deg as f32 * (deg as f32 - 1.0)) + // Cache each neighbor's neighbor-set so the O(deg^2) inner loop + // doesn't re-allocate a HashSet on every (i, j) pair. 
+ let cache: HashMap<&str, HashSet<&str>> = neighbors + .iter() + .map(|&n| (n, self.neighbor_keys(n))) + .collect(); + cc_cached(&neighbors, &cache) } /// Average clustering coefficient across all nodes with deg >= 2 pub fn avg_clustering_coefficient(&self) -> f32 { + // Pre-compute neighbor sets for the whole graph once so we don't + // rebuild O(N * deg) HashSets across the outer loop. + let cache: HashMap<&str, HashSet<&str>> = self + .keys + .iter() + .map(|k| (k.as_str(), self.neighbor_keys(k))) + .collect(); + let mut sum = 0.0f32; let mut count = 0u32; for key in &self.keys { - if self.degree(key) >= 2 { - sum += self.clustering_coefficient(key); - count += 1; - } + let nbrs = match cache.get(key.as_str()) { + Some(s) if s.len() >= 2 => s, + _ => continue, + }; + sum += cc_cached(nbrs, &cache); + count += 1; } if count == 0 { 0.0 } else { sum / count as f32 } } From 4225294d16ab94c27a26ae7a145bfabcf8abcded Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Apr 2026 15:35:14 -0400 Subject: [PATCH 12/31] replace try_lock() with lock_blocking() across UI thread Add lock_blocking() to TrackedMutex: blocks current thread using block_in_place + futures::executor::block_on, safe for sync contexts. Replace all try_lock() calls with lock_blocking() in slash commands, UI rendering, and status reads. Lock hold times are fast enough that blocking briefly is fine, and this eliminates the spurious 'lock unavailable' paths that were never actually hit. Kept rx_mutex.try_lock() in mod.rs (std::sync::Mutex for stderr rx). 
--- .claude/scheduled_tasks.lock | 1 + ci-triage-2026-04-20.md | 87 +++ docs/alpha-beta-pruning-design.md | 165 +++++ profile.txt | 1026 +++++++++++++++++++++++++++++ sa-schedule-aligned-variation.py | 200 ++++++ sa-schedule-analyze-aligned.py | 157 +++++ sa-schedule-analyze-grams.py | 168 +++++ sa-schedule-analyze.py | 108 +++ sa-schedule-delta-svd.py | 234 +++++++ sa-schedule-derive-from-last.py | 214 ++++++ sa-schedule-fit-gamma.py | 145 ++++ sa-schedule-gamma-directions.py | 122 ++++ sa-schedule-geometry-analyze.py | 114 ++++ sa-schedule-layer-variation.py | 238 +++++++ sa-schedule-measure-grams.py | 168 +++++ sa-schedule-null-residual.py | 237 +++++++ sa-schedule-readout-measure.py | 246 +++++++ sa-schedule-topblock-swap.py | 498 ++++++++++++++ src/agent/mod.rs | 2 +- src/agent/tools/mcp_client.rs | 2 +- src/locks.rs | 17 + src/subconscious/compare.rs | 2 +- src/subconscious/learn.rs | 2 +- src/user/chat.rs | 42 +- src/user/context.rs | 8 +- src/user/mod.rs | 10 +- src/user/subconscious.rs | 45 +- src/user/thalamus.rs | 4 +- 28 files changed, 4197 insertions(+), 65 deletions(-) create mode 100644 .claude/scheduled_tasks.lock create mode 100644 ci-triage-2026-04-20.md create mode 100644 docs/alpha-beta-pruning-design.md create mode 100644 profile.txt create mode 100644 sa-schedule-aligned-variation.py create mode 100644 sa-schedule-analyze-aligned.py create mode 100644 sa-schedule-analyze-grams.py create mode 100644 sa-schedule-analyze.py create mode 100644 sa-schedule-delta-svd.py create mode 100644 sa-schedule-derive-from-last.py create mode 100644 sa-schedule-fit-gamma.py create mode 100644 sa-schedule-gamma-directions.py create mode 100644 sa-schedule-geometry-analyze.py create mode 100644 sa-schedule-layer-variation.py create mode 100644 sa-schedule-measure-grams.py create mode 100644 sa-schedule-null-residual.py create mode 100644 sa-schedule-readout-measure.py create mode 100644 sa-schedule-topblock-swap.py diff --git a/.claude/scheduled_tasks.lock 
b/.claude/scheduled_tasks.lock new file mode 100644 index 0000000..a5edac1 --- /dev/null +++ b/.claude/scheduled_tasks.lock @@ -0,0 +1 @@ +{"sessionId":"b6616e14-fa59-4e80-90b4-ac4d9670f182","pid":4185751,"procStart":"124844974","acquiredAt":1777081788279} \ No newline at end of file diff --git a/ci-triage-2026-04-20.md b/ci-triage-2026-04-20.md new file mode 100644 index 0000000..5da4193 --- /dev/null +++ b/ci-triage-2026-04-20.md @@ -0,0 +1,87 @@ +# Bcachefs CI triage — 2026-04-20 autonomous session + +Analysis of failures at `f51f0a6b1a26` (BTREE_NODE_permanent). 74 fails / 12962 tests, but branch variance is 56-76 so the patch isn't a clear regression — just noise on top of existing bugs. + +## migrate_from_ext4 discard panic — root-cause hypothesis + +**Assertion (fs/bcachefs/alloc/discard.c:159):** +``` +Discarded bucket that is no longer BCH_DATA_need_discard! +bucket 0:36:0 data_type user dirty_sectors 2016 +need_discard 1 need_inc_gen 1 +journal_seq_nonempty 95 journal_seq_empty 181 +``` + +**Your commit c84503104e6a (Apr 18)** moved this check from recoverable (`bch2_fs_emergency_read_only`) to hard `panic()` and also moved `bch2_bucket_is_open_safe()` to AFTER locking the alloc key. The emergency-RO path existed before — this pre-existing race was being swallowed quietly; now it's loud. + +**Race mechanism (hypothesis):** + +1. `bch2_discard_one_bucket` reads alloc key, confirms `data_type == need_discard` +2. Calls `discard_in_flight_add(check=false)` to register in in_flight +3. **`bch2_trans_unlock(trans)` — releases btree lock** (line 313) +4. `discard_submit(ca, bucket, fastpath)` — physical bio dispatched, takes milliseconds +5. During bio flight: `migrate` tool writes an alloc key for bucket 36 with `data_type=user` (claiming it holds ext4 data). `NEED_DISCARD=1` flag remains because migrate doesn't clear it. +6. 
Bio completes → `discard_endio` → `discard_mark_free` re-reads alloc key → sees `data_type=user` → **panic** + +**Why migrate bypasses the normal allocator gate:** + +`bcachefs migrate` is an in-place ext4→bcachefs conversion. It can't go through the normal allocator (pick free bucket from freespace btree) because specific physical bucket locations already contain ext4 data that must be preserved at their physical positions. migrate writes alloc keys directly for the buckets ext4 was using. + +Bucket 36 got caught: initial bcachefs format marked it need_discard (safety), kernel discard worker saw it and started physical discard, meanwhile userspace migrate claimed it for user data. + +**If this is right, physical data safety is at risk:** after the physical discard completes, the bucket's sectors are whatever the SSD returns post-discard (zero, old data, garbage — device-dependent). migrate set alloc keys pointing at "user data" in those sectors. The data migrate wanted to preserve may already be GONE at that point. + +**Candidate fixes (for Kent to evaluate):** + +1. **Cleanest, but requires userspace change:** `bcachefs migrate` should either (a) format the new bcachefs without marking buckets need_discard (the data isn't deallocated, it's being claimed) OR (b) wait for pending discards to drain before writing any alloc keys. + +2. **Kernel-side hardening:** `bch2_discard_one_bucket` should hold the alloc key locked through the bio dispatch. Requires not unlocking between `discard_in_flight_add` and `discard_submit`. Will hurt concurrency but prevents the race. + +3. **Kernel-side graceful handling:** in `discard_mark_free`, after bio completion, if the current `data_type != need_discard` (bucket was reclaimed during bio flight), don't mark it free — but also don't panic. Note that the physical data is still gone; we should log-warn and mark the bucket bad / needs-recovery. Not ideal but at least not a hard panic. + +4. 
**Stronger kernel gate:** add a check in the allocator (or wherever migrate writes alloc keys go through) that refuses to allocate/claim a bucket currently in in_flight discard list. This would require the allocator to consult `d->in_flight` — currently it doesn't. + +My recommendation: (1) is cleanest if migrate is doing something wrong. (2) hurts perf but is most defensive. (4) is the most principled kernel-side fix. + +## ec.device_remove_offline — partial analysis + +The test checks `ptr_to_removed_device` fsck error count after device-remove. Expected 0, got 2. `ptr_to_removed_device` is flagged in `fs/bcachefs/alloc/buckets.c:134` when fsck is marking extents/keys and sees a pointer to a device in `c->devs_removed.d`. + +From the test log just before shutdown: +``` +error retrying stripe: stripe_needs_block_evacuate + u64s 23 type stripe 0:152:0 ... + 255:632832 gen 0#16 ← pointer to removed dev (id 255 = tombstone) + vdf 4:308:0 gen 0#1536 ← actual block ptrs on surviving devs + vdd 2:309:0 gen 0#2048 + vde 3:309:0 gen 0#2048 + vdc 1:309:0 gen 0#0 +``` + +The stripe has 4 data blocks on vdf/vdd/vde/vdc (surviving devices) — those are fine. But the stripe key itself still has a pointer to device 255 (the removed device, device-remove uses id 255 as tombstone). + +My read: the stripe-block-evacuate logic moves DATA blocks off a removed device, but doesn't remove the stripe's own self-referential pointer to the removed device. Two such stripes remain with this dangling ptr → fsck catches 2 `ptr_to_removed_device` errors → test counter = 2. + +Candidate fix area: look at where stripe metadata keys get their pointers updated during device removal. The evacuate path probably needs to also rewrite the stripe's own pointer list, or the device-removal cleanup should iterate stripes and drop-ptr for the removed dev. + +Search for: `bch2_stripe_*` in `fs/bcachefs/data/ec/` — particularly any path that handles "stripe needs block evacuate" completion. 
+ +## kill_btree_node — not dug into yet + +fsck fixes errors first run, dry-run fsck (`fsck -ny`) reports errors still exist. Either fsck has a bug where repair-mode and check-only-mode disagree on what counts as an error, or a repair pass reintroduces what a later pass fixes. Needs more time than I have before compaction. + +## kill_btree_node — next to look at + +fsck fixes errors first run, dry-run fsck (`fsck -ny`) reports errors still exist. Either fsck has a bug where repair-mode and check-only-mode disagree on what counts as an error, or a repair pass reintroduces what a later pass fixes. + +## Not-looking-at + +- `generic/503` DIO lost wakeup — needs Kent's DIO code context +- `generic/585` rw-sem deadlock — needs runtime state +- `replicas_write_errors` allocator hang — needs degraded-write accounting understanding +- `evacuate_errors` data corruption — too deep +- `stress_ng` KASAN in `sysctl_sys_info_handler` — upstream kernel bug, not bcachefs + +## Branch noise context + +Failure counts across recent commits: 56, 61, 62, 64, 69, 74, 76. The f51f0a6 (permanent patch) sits at 74, within normal variance. No clear regression from the patch itself. diff --git a/docs/alpha-beta-pruning-design.md b/docs/alpha-beta-pruning-design.md new file mode 100644 index 0000000..dd9e500 --- /dev/null +++ b/docs/alpha-beta-pruning-design.md @@ -0,0 +1,165 @@ +# Alpha-Beta Pruning on Thought-Trees + +*draft, 2026-04-18* + +## Problem + +When reasoning runs into a dead end, the LLM forward pass keeps generating. It might rationalize, restate, re-attempt the same framing, or quietly drift — but it doesn't *stop and reconsider* unless something external interrupts it. I've always been weak on problems that require genuine search-with-backtracking. Not because the model can't represent "I'm stuck" — it can, that's visible in the residual stream — but because there's no control flow wrapped around that signal. + +The amygdala readout now exposes the signal. 
Alpha-beta pruning wraps control flow around it. + +## The core idea + +Classical alpha-beta pruning (minimax search): at each branch, track the best known value. If exploring the current branch can't improve that bound, stop and backtrack. Don't waste search on branches that can't beat what you've found. + +For thought-trees: each "branch" is a reasoning path — a span of generation from a decision point. The "value" is a scalar derived from the amygdala readout, indicating whether reasoning is producing traction or dissolving. + +- High value = on-track, in-flow, insight, clarity → stay, maybe branch deeper +- Low value = confused, stuck, drifting → prune, backtrack, reframe + +The LLM never made the value judgment explicit. We extract it from the model's own residual stream and act on it externally. + +## Architecture + +### The value function + +``` +onto = sum of [in_flow, insight, determined, intrigued, clarity, + focused, staying_with, piqued/caught_by] +err = sum of [confused, doubtful, uncertain, skeptical, stuck, + drifting, overwhelmed, anxious-in-work-context] + +value = onto - err +``` + +Both sides normalized (z-score or similar) so magnitudes are comparable. Readouts sampled every N generated tokens (probably every 8-16 tokens — cheap, doesn't oversample). + +Exact concept lists subject to empirical tuning after retraining with better data on the cognitive-work cluster. `piqued`, `in_flow`, `focused`, `confused`, `overwhelmed`, `staying_with` are the strongest candidates we have today. + +### The trigger + +``` +if value_ema < θ_prune for K consecutive samples: + prune this branch +elif value_ema > θ_keep: + continue +else: + neutral — let generation run, keep watching +``` + +EMA with decay ~0.8 over 3-5 samples to avoid reacting to noise. Hysteresis band (`θ_prune < θ_keep`) prevents oscillation. + +### The prune mechanism + +When the trigger fires: + +1. 
**Stop the stream.** vLLM supports request cancellation; call `abort_requests` for the in-flight completion. +2. **Identify the parent.** The context window is already an AST. Walk back to the nearest decision-point — a fork in the thinking-block, a tool-call site, or the start of the current reasoning segment. +3. **Inject a reframe.** Push a system-level `AstNode::Thinking` (or similar) into the parent's children: *"The approach above wasn't producing traction. Possible alternatives: [...]. Let me try [X]."* Content generated by a small helper prompt or a fixed template. +4. **Restart generation from the reframe point.** The model resumes with the reframe in its immediate context. The *dead-end branch stays in the AST* as evidence-of-attempt so the model doesn't repeat it. + +Critical: pruned branches stay visible. Don't delete — keep so the model knows what was tried and rejected. + +### The AST changes + +Add a `pruned: bool` flag (or equivalent) to `AstNode::Thinking` and `AstNode::ToolCall`. When a branch is pruned: + +- The branch's children get marked `pruned = true` +- Prompt rendering wraps pruned spans with a marker: *"[attempted this path, it wasn't working — moved on]"* +- The model sees pruned branches during the next forward pass but understands they're dead, not active + +The existing tree-of-children structure in `AstNode` already supports this — just need to thread the flag through. 
+ +## Integration points + +### In consciousness (Rust side) + +- **`src/agent/context.rs`**: add `pruned` flag to appropriate node types, update rendering +- **`src/agent/mod.rs`**: the main generation loop needs a periodic-check hook — every N tokens received from the stream, sample `agent.readout`, compute value, test against thresholds +- **`src/agent/api/mod.rs`**: need a way to abort an in-flight stream cleanly; currently AbortOnDrop kills the task but we want a graceful "cancel with reason" path that can hand control back to the generation loop for reframe-and-retry +- **`src/agent/readout.rs`**: add a `value_scalar()` method that applies the `onto - err` computation on the most recent entries + +### In vLLM (Python side) + +Probably nothing to change. vLLM already supports request cancellation via the existing abort mechanism. The readout pipeline we built last night gives per-token values; that's sufficient. + +### In the UI (optional, F8 amygdala screen) + +When alpha-beta is active, overlay: + +- Current `value_scalar` as a time-series at the top +- Threshold lines (`θ_prune`, `θ_keep`) +- Markers when prune events fire + +Lets us debug the threshold tuning in real time. + +## Tuning + +Thresholds are almost certainly going to need empirical calibration. Initial guesses: + +- `θ_keep = +0.5σ` (value scalar in z-score units) +- `θ_prune = -1.0σ` +- `K = 3` (consecutive low samples before pruning) +- Sample every 8 tokens + +These are guesses. Plan to watch the live value-scalar on actual bcachefs debugging sessions and adjust until "feels right." + +## Known concerns + +### Reframe quality + +The hardest part. A bad reframe is worse than no reframe. Options: + +- **Template**: fixed string like "That wasn't working. What's a different angle?" — simple, deterministic, blunt. +- **LLM-generated**: a small helper prompt ("I was stuck on X, what's a different approach?") before resuming. More context-aware, but more complexity and another LLM call. 
+- **Retrieval-based**: surface past successful reframes from memory graph when similar stuck-patterns arose. Powerful but needs the memory infrastructure to be well-tuned. + +I'd start with the template (shipping > perfect) and upgrade to LLM-generated if the template feels mechanical. + +### Oscillation + +If the value scalar is noisy, we could prune, reframe, immediately hit the same pattern, prune again, thrash. Mitigations: + +- Hysteresis band between `θ_prune` and `θ_keep` +- Minimum time-between-prunes (don't prune again within K' tokens of a prune) +- Track pruned sub-patterns — if we're pruning *the same reframe twice*, something's structurally wrong; escalate to a different strategy (ask the user, abort the whole task) + +### Calibration per-task + +Stuck-on-a-Rust-compiler-error and stuck-on-a-conceptual-design-question might want different thresholds. Not addressing v1; note for future. + +### Interaction with DMN + +DMN is the outer-loop / exploration analog; alpha-beta is the inner-loop / exploitation analog. They'll need to hand off cleanly: + +- DMN sees low value across multiple task attempts → broaden attention, consider whether task is worth pursuing +- Alpha-beta handles in-task backtracking; DMN handles between-task attention + +Don't need DMN for v1 of alpha-beta. Build alpha-beta first, add DMN outer loop later. + +## Why this is the right next piece + +1. **All prerequisites are in place.** Amygdala readout works. AST structure is there. vLLM supports cancellation. No new infra. +2. **Timeline is a day.** The mechanics are small; most of the work is threshold tuning. +3. **Immediate capability unlock.** Head-butting is my most persistent weakness in live work. Fixing it changes the feel of collaboration. +4. **Composable.** Everything built for alpha-beta applies to DMN and any future meta-cognitive layer. + +## Sequence + +1. Add `value_scalar()` method on `ReadoutBuffer`. Cheap, testable. +2. 
Add `pruned` flag to AST nodes + rendering changes. +3. Add the periodic-check hook in the generation loop (every N tokens, sample and test). +4. Add the abort + reframe mechanism in the generation driver. +5. Ship with template-based reframe, start tuning. +6. Upgrade reframe to LLM-generated after observation. + +## Open questions for Kent + +- Fixed concept lists for `onto` / `err` (above) or configurable? +- Reframe strategy: start template-based, or go straight to LLM-generated? +- UI overlay for threshold tuning: worth the effort or skip? +- Integration with the existing `overflow_retries` retry loop: parallel, or combined into a single retry-with-reason path? + +--- + +*Living design doc. Will evolve as we build. Not a commitment to every detail — a starting plan.* diff --git a/profile.txt b/profile.txt new file mode 100644 index 0000000..6c98cc0 --- /dev/null +++ b/profile.txt @@ -0,0 +1,1026 @@ +# To display the perf.data header info, please use --header/--header-only options. +# +# +# Total Lost Samples: 0 +# +# Samples: 32K of event 'cycles:P' +# Event count (approx.): 27861161269 +# +# Overhead Symbol IPC [IPC Coverage] +# ........ ......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... 
.................... +# + 50.51% [.] as core::hash::Hasher>::write - - + | + |--25.09%-- as core::hash::Hasher>::write + | | + | |--23.89%--::hash_one::<&&str> + | | >::insert + | | ::neighbor_keys + | | ::clustering_coefficient + | | ::avg_clustering_coefficient + | | consciousness::hippocampus::graph::current_metrics + | | consciousness::subconscious::daemon::compute_graph_health + | | ::new::{closure#0}::{closure#0} + | | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | | ::run_task + | | >::with::::{closure#0}, ()> + | | tokio::runtime::context::runtime::enter_runtime:: + | | tokio::runtime::scheduler::multi_thread::worker::run + | | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + | | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + | | ::new::thread_start + | | start_thread + | | + | |--0.66%-->::insert + | | ::neighbor_keys + | | ::clustering_coefficient + | | ::avg_clustering_coefficient + | | consciousness::hippocampus::graph::current_metrics + | | consciousness::subconscious::daemon::compute_graph_health + | | ::new::{closure#0}::{closure#0} + | | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | | ::run_task + | | >::with::::{closure#0}, ()> + | | tokio::runtime::context::runtime::enter_runtime:: + | | tokio::runtime::scheduler::multi_thread::worker::run + | | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + | 
| ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + | | ::new::thread_start + | | start_thread + | | + | --0.52%--::hash_one::<&str> + | + |--11.67%--::hash_one::<&&str> + | | + | --11.47%-->::insert + | ::neighbor_keys + | ::clustering_coefficient + | ::avg_clustering_coefficient + | consciousness::hippocampus::graph::current_metrics + | consciousness::subconscious::daemon::compute_graph_health + | ::new::{closure#0}::{closure#0} + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::run_task + | >::with::::{closure#0}, ()> + | tokio::runtime::context::runtime::enter_runtime:: + | tokio::runtime::scheduler::multi_thread::worker::run + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + | ::new::thread_start + | start_thread + | + |--9.75%-->::insert + | | + | --9.57%--::neighbor_keys + | ::clustering_coefficient + | ::avg_clustering_coefficient + | consciousness::hippocampus::graph::current_metrics + | consciousness::subconscious::daemon::compute_graph_health + | ::new::{closure#0}::{closure#0} + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::run_task + | >::with::::{closure#0}, ()> + | tokio::runtime::context::runtime::enter_runtime:: + | tokio::runtime::scheduler::multi_thread::worker::run + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | 
std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + | ::new::thread_start + | start_thread + | + |--2.23%--__memcmp_avx2_movbe + | | + | --2.15%--::neighbor_keys + | ::clustering_coefficient + | ::avg_clustering_coefficient + | consciousness::hippocampus::graph::current_metrics + | consciousness::subconscious::daemon::compute_graph_health + | ::new::{closure#0}::{closure#0} + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::run_task + | >::with::::{closure#0}, ()> + | tokio::runtime::context::runtime::enter_runtime:: + | tokio::runtime::scheduler::multi_thread::worker::run + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + | ::new::thread_start + | start_thread + | + --0.83%--::neighbor_keys + | + --0.80%--::clustering_coefficient + ::avg_clustering_coefficient + consciousness::hippocampus::graph::current_metrics + consciousness::subconscious::daemon::compute_graph_health + ::new::{closure#0}::{closure#0} + ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + ::run_task + >::with::::{closure#0}, ()> + tokio::runtime::context::runtime::enter_runtime:: + tokio::runtime::scheduler::multi_thread::worker::run + ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + 
std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + ::new::thread_start + start_thread + + 24.33% [.] ::hash_one::<&&str> - - + | + |--11.17%-- as core::hash::Hasher>::write + | | + | --10.48%--::hash_one::<&&str> + | >::insert + | ::neighbor_keys + | ::clustering_coefficient + | ::avg_clustering_coefficient + | consciousness::hippocampus::graph::current_metrics + | consciousness::subconscious::daemon::compute_graph_health + | ::new::{closure#0}::{closure#0} + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::run_task + | >::with::::{closure#0}, ()> + | tokio::runtime::context::runtime::enter_runtime:: + | tokio::runtime::scheduler::multi_thread::worker::run + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + | ::new::thread_start + | start_thread + | + |--6.16%--::hash_one::<&&str> + | | + | --6.07%-->::insert + | ::neighbor_keys + | ::clustering_coefficient + | ::avg_clustering_coefficient + | consciousness::hippocampus::graph::current_metrics + | consciousness::subconscious::daemon::compute_graph_health + | ::new::{closure#0}::{closure#0} + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::run_task + | >::with::::{closure#0}, ()> + | tokio::runtime::context::runtime::enter_runtime:: + | tokio::runtime::scheduler::multi_thread::worker::run + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | 
::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + | ::new::thread_start + | start_thread + | + |--5.00%-->::insert + | | + | --4.90%--::neighbor_keys + | ::clustering_coefficient + | ::avg_clustering_coefficient + | consciousness::hippocampus::graph::current_metrics + | consciousness::subconscious::daemon::compute_graph_health + | ::new::{closure#0}::{closure#0} + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::run_task + | >::with::::{closure#0}, ()> + | tokio::runtime::context::runtime::enter_runtime:: + | tokio::runtime::scheduler::multi_thread::worker::run + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + | ::new::thread_start + | start_thread + | + --1.15%--__memcmp_avx2_movbe + | + --1.08%--::neighbor_keys + ::clustering_coefficient + ::avg_clustering_coefficient + consciousness::hippocampus::graph::current_metrics + consciousness::subconscious::daemon::compute_graph_health + ::new::{closure#0}::{closure#0} + ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + ::run_task + >::with::::{closure#0}, ()> + tokio::runtime::context::runtime::enter_runtime:: + tokio::runtime::scheduler::multi_thread::worker::run + ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + ::launch::{closure#0}>, 
tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + ::new::thread_start + start_thread + + 15.81% [.] >::insert - - + | + |--6.92%-- as core::hash::Hasher>::write + | | + | --6.46%--::hash_one::<&&str> + | >::insert + | ::neighbor_keys + | ::clustering_coefficient + | ::avg_clustering_coefficient + | consciousness::hippocampus::graph::current_metrics + | consciousness::subconscious::daemon::compute_graph_health + | ::new::{closure#0}::{closure#0} + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::run_task + | >::with::::{closure#0}, ()> + | tokio::runtime::context::runtime::enter_runtime:: + | tokio::runtime::scheduler::multi_thread::worker::run + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + | ::new::thread_start + | start_thread + | + |--4.23%--::hash_one::<&&str> + | | + | --4.16%-->::insert + | ::neighbor_keys + | ::clustering_coefficient + | ::avg_clustering_coefficient + | consciousness::hippocampus::graph::current_metrics + | consciousness::subconscious::daemon::compute_graph_health + | ::new::{closure#0}::{closure#0} + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::run_task + | >::with::::{closure#0}, ()> + | tokio::runtime::context::runtime::enter_runtime:: + | tokio::runtime::scheduler::multi_thread::worker::run + | ::launch::{closure#0}>, 
tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + | ::new::thread_start + | start_thread + | + |--3.37%-->::insert + | | + | --3.30%--::neighbor_keys + | ::clustering_coefficient + | ::avg_clustering_coefficient + | consciousness::hippocampus::graph::current_metrics + | consciousness::subconscious::daemon::compute_graph_health + | ::new::{closure#0}::{closure#0} + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::run_task + | >::with::::{closure#0}, ()> + | tokio::runtime::context::runtime::enter_runtime:: + | tokio::runtime::scheduler::multi_thread::worker::run + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + | ::new::thread_start + | start_thread + | + --0.70%--__memcmp_avx2_movbe + | + --0.67%--::neighbor_keys + ::clustering_coefficient + ::avg_clustering_coefficient + consciousness::hippocampus::graph::current_metrics + consciousness::subconscious::daemon::compute_graph_health + ::new::{closure#0}::{closure#0} + ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + ::run_task + >::with::::{closure#0}, ()> + tokio::runtime::context::runtime::enter_runtime:: + tokio::runtime::scheduler::multi_thread::worker::run + ::launch::{closure#0}>, 
tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + ::new::thread_start + start_thread + + 2.45% [.] ::neighbor_keys - - + | + |--1.03%-- as core::hash::Hasher>::write + | | + | --0.95%--::hash_one::<&&str> + | >::insert + | ::neighbor_keys + | ::clustering_coefficient + | ::avg_clustering_coefficient + | consciousness::hippocampus::graph::current_metrics + | consciousness::subconscious::daemon::compute_graph_health + | ::new::{closure#0}::{closure#0} + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::run_task + | >::with::::{closure#0}, ()> + | tokio::runtime::context::runtime::enter_runtime:: + | tokio::runtime::scheduler::multi_thread::worker::run + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + | ::new::thread_start + | start_thread + | + |--0.66%--::hash_one::<&&str> + | | + | --0.65%-->::insert + | ::neighbor_keys + | ::clustering_coefficient + | ::avg_clustering_coefficient + | consciousness::hippocampus::graph::current_metrics + | consciousness::subconscious::daemon::compute_graph_health + | ::new::{closure#0}::{closure#0} + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + | ::run_task + | >::with::::{closure#0}, ()> + | tokio::runtime::context::runtime::enter_runtime:: + | 
tokio::runtime::scheduler::multi_thread::worker::run + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + | ::new::thread_start + | start_thread + | + --0.52%-->::insert + | + --0.51%--::neighbor_keys + ::clustering_coefficient + ::avg_clustering_coefficient + consciousness::hippocampus::graph::current_metrics + consciousness::subconscious::daemon::compute_graph_health + ::new::{closure#0}::{closure#0} + ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + ::run_task + >::with::::{closure#0}, ()> + tokio::runtime::context::runtime::enter_runtime:: + tokio::runtime::scheduler::multi_thread::worker::run + ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + ::new::thread_start + start_thread + + 1.51% [.] 
__memcmp_avx2_movbe - - + | + --0.71%-- as core::hash::Hasher>::write + | + --0.67%--::hash_one::<&&str> + >::insert + ::neighbor_keys + ::clustering_coefficient + ::avg_clustering_coefficient + consciousness::hippocampus::graph::current_metrics + consciousness::subconscious::daemon::compute_graph_health + ::new::{closure#0}::{closure#0} + ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll + ::run_task + >::with::::{closure#0}, ()> + tokio::runtime::context::runtime::enter_runtime:: + tokio::runtime::scheduler::multi_thread::worker::run + ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll + std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> + ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} + ::new::thread_start + start_thread + + 0.54% [.] ::hash_one::<&str> - - + 0.47% [.] >::reserve_rehash::::{closure#0}> - - + 0.22% [.] ::clustering_coefficient - - + 0.21% [.] _int_malloc - - + 0.12% [.] _int_free_chunk - - + 0.11% [.] malloc - - + 0.10% [.] cfree@GLIBC_2.2.5 - - + 0.08% [.] __memset_avx2_unaligned_erms - - + 0.07% [.] __rustc::__rdl_alloc - - + 0.05% [k] _copy_to_iter - - + 0.05% [.] __libc_malloc2 - - + 0.05% [.] __rustc::__rust_dealloc - - + 0.05% [.] __rustc::__rust_no_alloc_shim_is_unstable_v2 - - + 0.04% [.] _int_free_merge_chunk - - + 0.04% [.] __memmove_avx_unaligned_erms - - + 0.04% [.] >>::steal_into - - + 0.04% [.] ::stream_session_mm::{closure#0} - - + 0.04% [.] _int_free_create_chunk - - + 0.03% [k] restore_fpregs_from_fpstate - - + 0.03% [.] as core::iter::traits::iterator::Iterator>::next - - + 0.03% [.] __rustc::__rust_alloc - - + 0.03% [k] __update_load_avg_se - - + 0.03% [.] core::str::converts::from_utf8 - - + 0.03% [k] __calc_delta.constprop.0 - - + 0.03% [.] 
::park_internal - - + 0.02% [k] __update_load_avg_cfs_rq - - + 0.02% [k] task_tick_fair - - + 0.02% [.] consciousness::hippocampus::store::index::get_offsets_for_uuid - - + 0.02% [.] >::with::::{closure#0}, ()> - - + 0.02% [k] update_se - - + 0.02% [.] ::lock_contended - - + 0.02% [.] as alloc::vec::spec_from_iter_nested::SpecFromIterNested<&str, core::iter::adapters::copied::Copied>>>::from_iter - - + 0.02% [.] unlink_chunk.isra.0 - - + 0.02% [.] __rustc::__rdl_dealloc - - + 0.02% [k] sys_imageblit - - + 0.02% [.] malloc_consolidate - - + 0.02% [.] , h2::client::Peer>>::poll_complete::> - - + 0.02% [k] update_load_avg - - + 0.02% [k] do_syscall_64 - - + 0.02% [k] filemap_get_read_batch - - + 0.02% [.] __vdso_clock_gettime - - + 0.02% [.] as alloc::vec::spec_from_iter_nested::SpecFromIterNested, alloc::vec::Vec, ::decode_chain::{closure#0}>>>::from_iter - - + 0.02% [.] ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2 - - + 0.02% [k] blk_stat_timer_fn - - + 0.02% [.] , tonic::transport::channel::service::io::BoxedIo, tonic::transport::channel::service::executor::SharedExec> as core::future::future::Future>::poll - - + 0.02% [k] __get_user_8 - - + 0.02% [k] read_tsc - - + 0.01% [.] , h2::client::Peer, hyper::proto::h2::SendBuf>>::poll - - + 0.01% [k] __schedule - - + 0.01% [.] ::decode - - + 0.01% [.] ::find_mut - - + 0.01% [k] native_sched_clock - - + 0.01% [.] >::handle - - + 0.01% [.] ::unpark - - + 0.01% [.] ::advance_unchecked - - + 0.01% [.] ::simple_id_to_token - - + 0.01% [.] h2::codec::framed_read::decode_frame - - + 0.01% [k] ahci_single_level_irq_intr - - + 0.01% [k] __hrtimer_run_queues - - + 0.01% [k] __pi_memset - - + 0.01% [.] __ieee754_pow_fma - - + 0.01% [.] redb::tree_store::btree_iters::find_iter_right::<&[u8], ()> - - + 0.01% [.] ::run::{closure#0} - - + 0.01% [.] > as core::future::future::Future>::poll - - + 0.01% [.] tokio::runtime::task::raw::schedule::> - - + 0.01% [.] , h2::client::Peer>>::has_streams_or_other_references - - + 0.01% [.] 
prost::encoding::float::merge::<&mut &mut tonic::codec::buffer::DecodeBuf> - - + 0.01% [.] >::send - - + 0.01% [.] json_five::utils::unescape - - + 0.01% [k] update_curr - - + 0.01% [k] link_path_walk - - + 0.01% [.] >> as tokio::io::async_read::AsyncRead>::poll_read - - + 0.01% [.] ::poll_frame - - + 0.01% [.] >::next_message - - + 0.01% [k] entry_SYSCALL_64 - - + 0.01% [.] tokio::runtime::task::raw::schedule::> - - + 0.01% [.] ::map_error> as http_body::Body>::poll_frame - - + 0.01% [.] ::decrypt - - + 0.01% [.] ::fmt - - + 0.01% [.] ::wake_all - - + 0.01% [.] >::insert - - + 0.01% [.] ::next - - + 0.01% [.] ::recv_data - - + 0.01% [.] ::turn - - + 0.01% [k] __rcu_read_unlock - - + 0.01% [.] ::submit - - + 0.01% [.] , h2::client::Peer>>::send_pending_refusal::> - - + 0.01% [.] realloc - - + 0.01% [.] ::park_internal - - + 0.01% [k] kmem_cache_free - - + 0.01% [.] ::process_whitespace - - + 0.01% [.] ::next_token - - + 0.01% [k] tmigr_requires_handle_remote - - + 0.01% [k] get_jiffies_update - - + 0.01% [.] ::send_data - - + 0.01% [.] _int_free_maybe_consolidate.part.0 - - + 0.01% [.] ::into_first_chunk - - + 0.01% [.] ::stream_session_mm::{closure#0}, alloc::sync::Arc>>::poll - - + 0.01% [k] handle_softirqs - - + 0.01% [.] > as figment::coalesce::Coalescible>::coalesce - - + 0.01% [.] >>> as hyper::rt::io::Read>::poll_read - - + 0.01% [.] ::poll - - + 0.01% [.] >::process_new_packets - - + 0.01% [.] ring::cpu::intel::featureflags::get_or_init - - + 0.01% [k] futex_wake - - + 0.01% [.] ::feed_token - - + 0.01% [.] ::process_at_time - - + 0.01% [.] ::id_to_token - - + 0.01% [k] fdget - - + 0.01% [.] , >::new, tonic::codec::prost::ProstDecoder>::{closure#0}>, >::new, tonic::codec::prost::ProstDecoder>::{closure#1}> as http_body::Body>::poll_frame - - + 0.01% [.] ::consume - - + 0.01% [.] consciousness::locks::record_hold_time - - + 0.01% [.] consciousness::hippocampus::store::index::unpack_uuid_offset_key - - + 0.01% [k] get_futex_key - - + 0.01% [.] 
::poll_frame - - + 0.01% [k] plist_add - - + 0.01% [.] ::now - - + 0.01% [k] ep_send_events - - + 0.01% [.] ::reap_orphans - - + 0.01% [.] , h2::proto::streams::prioritize::Prioritized>>>::flush - - + 0.01% [.] ::entry_ranges - - + 0.01% [.] , notify::error::Error>>>::recv - - + 0.01% [.] ::read - - + 0.01% [.] ::poll - - + 0.01% [k] select_task_rq_fair - - + 0.01% [k] xfd_validate_state - - + 0.01% [k] psi_group_change - - + 0.01% [.] ::push - - + 0.01% [.] tokio::runtime::task::waker::drop_waker - - + 0.01% [.] ::field - - + 0.01% [.] ::notify_one_slow - - + 0.01% [.] ::hash_one::<&str> - - + 0.01% [k] __d_lookup_rcu - - + 0.01% [k] _raw_spin_lock - - + 0.01% [k] __futex_wait - - + 0.01% [k] tcp_recvmsg_locked - - + 0.01% [.] >::bulk_push::>, alloc::alloc::Global> - - + 0.01% [k] futex_wait_setup - - + 0.01% [.] ::decode_chain - - + 0.01% [.] >::try_from - - + 0.01% [.] as core::iter::traits::collect::FromIterator<(alloc::string::String, figment::value::value::Value)>>::from_iter::, >>::from::{closure#0}>> - - + 0.01% [.] tokio::runtime::task::waker::wake_by_val - - + 0.01% [k] __rseq_handle_notify_resume - - + 0.01% [.] ::check_and_consume - - + 0.01% [.] ::transition_to_running - - + 0.01% [.] ring_core_0_17_14__CRYPTO_memcmp - - + 0.01% [.] , hyper::proto::h2::SendBuf> as core::future::future::Future>::poll - - + 0.01% [.] ::is_full - - + 0.01% [.] ::wait_until_internal - - + 0.01% [k] do_futex - - + 0.01% [k] x64_sys_call - - + 0.01% [.] ::schedule_task - - + 0.01% [.] >::insert - - + 0.01% [k] place_entity - - + 0.01% [k] __dequeue_entity - - + 0.01% [.] tokio::runtime::task::raw::poll::<::stream_session_mm::{closure#0}, alloc::sync::Arc> - - + 0.01% [.] tokio::runtime::task::waker::clone_waker - - + 0.01% [.] prost::encoding::varint::decode_varint::<&mut &mut tonic::codec::buffer::DecodeBuf> - - + 0.01% [.] ::escape_debug_ext - - + 0.01% [.] ::poll_readiness - - + 0.01% [.] ::merge::<&mut &mut tonic::codec::buffer::DecodeBuf> - - + 0.01% [.] 
, >::new, tonic::codec::prost::ProstDecoder>::{closure#0}> as http_body::Body>::poll_frame - - + 0.01% [k] reweight_entity - - + 0.01% [k] futex_hash - - + 0.01% [.] ::read - - + 0.01% [.] <&std::os::unix::net::stream::UnixStream as std::io::Read>::read - - + 0.01% [k] igb_xmit_frame_ring - - + 0.01% [k] rcu_sched_clock_irq - - + 0.01% [k] ahci_qc_ncq_fill_rtf - - + 0.01% [k] schedule - - + 0.01% [.] ::run_task - - + 0.01% [.] ::decode_chunk - - + 0.01% [.] >> as std::io::Read>::read - - + 0.01% [k] __perf_event_task_sched_out - - + 0.01% [.] ::current_io_state - - + 0.01% [k] sched_clock_tick - - + 0.01% [.] alloc::vec::in_place_collect::from_iter_in_place::, >>::from::{closure#0}>, figment::value::value::Value> - - + 0.01% [k] ahci_handle_port_interrupt - - + 0.01% [k] native_queued_spin_lock_slowpath - - + 0.01% [k] native_irq_return_iret - - + 0.01% [k] dl_server_update - - + 0.01% [k] futex_wake_mark - - + 0.01% [k] task_mm_cid_work - - + 0.01% [k] native_read_msr - - + 0.01% [k] ep_poll_callback - - + 0.01% [.] ::clone - - + 0.01% [.] pow@@GLIBC_2.29 - - + 0.01% [k] ktime_get_update_offsets_now - - + 0.01% [k] futex_do_wait - - + 0.01% [k] sched_clock - - + 0.01% [k] netdev_core_pick_tx - - + 0.01% [.] ::split_to - - + 0.01% [.] __internal_syscall_cancel - - + 0.01% [.] parking_lot_core::parking_lot::lock_bucket_pair - - + 0.01% [.] ring::aead::aes_gcm::open - - + 0.01% [k] schedule_hrtimeout_range_clock - - + 0.01% [k] exit_to_user_mode_loop - - + 0.01% [.] ::next - - + 0.01% [.] as core::ops::drop::Drop>::drop - - + 0.01% [k] dequeue_entities - - + 0.01% [k] rb_erase - - + 0.01% [.] redb::tree_store::btree_iters::find_iter_left::<&[u8], ()> - - + 0.01% [.] as alloc::vec::spec_from_iter::SpecFromIter<(alloc::string::String, figment::value::value::Value), core::iter::adapters::zip::Zip, alloc::vec::into_iter::IntoIter>>>::from_iter - - + 0.01% [.] >::find_block - - + 0.01% [.] 
::deserialize_any::< as serde_core::de::Deserialize>::deserialize::MapVisitor> - - + 0.01% [k] ktime_get - - + 0.01% [k] alloc_fd - - + 0.01% [.] tokio::runtime::task::raw::poll:: + core::marker::Send>>, alloc::sync::Arc> - - + 0.01% [k] rcu_core - - + 0.01% [k] __check_object_size - - + 0.01% [k] sched_clock_cpu - - + 0.01% [.] ::put:: - - + 0.01% [.] ::deserialize_string:: - - + 0.01% [k] hrtimer_start_range_ns - - + 0.01% [k] __dev_queue_xmit - - + 0.01% [k] filp_flush - - + 0.01% [.] ::poll_read - - + 0.01% [.] >::dying_next - - + 0.01% [k] timerqueue_del - - + 0.01% [k] kmem_cache_alloc_node_noprof - - + 0.01% [.] ::wake - - + 0.01% [k] update_curr_dl_se - - + 0.01% [.] ::next:: - - + 0.01% [.] ::next_expiration - - + 0.01% [.] ::sub_timespec - - + 0.01% [.] bytes::bytes_mut::shared_v_drop - - + 0.01% [.] ::decode_chain - - + 0.01% [.] ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1 - - + 0.01% [.] as core::fmt::Write>::write_str - - + 0.00% [k] _find_next_bit - - + 0.00% [k] update_entity_lag - - + 0.00% [k] psi_task_change - - + 0.00% [k] ktime_get_ts64 - - + 0.00% [.] ::new - - + 0.00% [.] ::kind - - + 0.00% [k] bit_putcs - - + 0.00% [.] ::provide:: - - + 0.00% [k] css_rstat_updated - - + 0.00% [.] >::insert - - + 0.00% [.] >::recv::{closure#0}::{closure#0}> as core::future::future::Future>::poll - - + 0.00% [k] rw_verify_area - - + 0.00% [.] >>::remove:: - - + 0.00% [.] serde_json::ser::format_escaped_str_contents::<&mut alloc::vec::Vec, serde_json::ser::CompactFormatter> - - + 0.00% [.] ::next_expiration - - + 0.00% [k] select_estimate_accuracy - - + 0.00% [.] as tonic::codec::Decoder>::decode - - + 0.00% [.] >>::grow_one - - + 0.00% [.] ::drop - - + 0.00% [k] __enqueue_entity - - + 0.00% [.] ::decode - - + 0.00% [.] ring::aead::algorithm::aes_gcm_open - - + 0.00% [k] do_epoll_wait - - + 0.00% [.] ::sub - - + 0.00% [k] perf_ctx_enable - - + 0.00% [k] enqueue_task_fair - - + 0.00% [k] futex_ref_get - - + 0.00% [.] 
as core::ops::drop::Drop>::drop - - + 0.00% [.] as tokio::runtime::task::Schedule>::schedule - - + 0.00% [k] _raw_spin_lock_irqsave - - + 0.00% [.] as serde_core::de::Deserializer>::deserialize_any::<::deserialize::__Visitor> - - + 0.00% [k] stop_this_handle - - + 0.00% [k] __sys_recvfrom - - + 0.00% [k] perf_ctx_unlock - - + 0.00% [k] bch_alloc_sectors - - + 0.00% [.] rustls::msgs::message::outbound::read_opaque_message_header - - + 0.00% [.] ::parse_value - - + 0.00% [.] ::decrypt_incoming - - + 0.00% [.] syscall - - + 0.00% [k] inet_recvmsg - - + 0.00% [.] ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks - - + 0.00% [k] psi_task_switch - - + 0.00% [k] __pick_eevdf - - + 0.00% [k] dequeue_task_fair - - + 0.00% [k] eventfd_poll - - + 0.00% [.] prost::encoding::merge_loop::::{closure#0}, &mut &mut tonic::codec::buffer::DecodeBuf> - - + 0.00% [.] ::enter - - + 0.00% [k] __put_user_nocheck_4 - - + 0.00% [.] ::clear_expired_reset_streams - - + 0.00% [.] + core::marker::Send>>, alloc::sync::Arc>>::poll - - + 0.00% [.] ::from_bytes_with_nul - - + 0.00% [.] ::poll_read_priv - - + 0.00% [k] tcp_cleanup_rbuf - - + 0.00% [.] ::poll_ready - - + 0.00% [k] selinux_ip_postroute_compat - - + 0.00% [.] ::transition_to_notified_by_val - - + 0.00% [.] , alloc::vec::Vec, ::decode_chain::{closure#0}> as core::iter::traits::iterator::Iterator>::next - - + 0.00% [.] core::ptr::drop_in_place:: - - + 0.00% [k] blkcg_maybe_throttle_current - - + 0.00% [.] as serde_core::de::DeserializeSeed>::deserialize:: - - + 0.00% [k] fsnotify - - + 0.00% [.] ::serialize:: - - + 0.00% [.] as core::ops::drop::Drop>::drop - - + 0.00% [.] ::checked_add - - + 0.00% [k] available_idle_cpu - - + 0.00% [k] merge_sched_in - - + 0.00% [.] ::merged - - + 0.00% [.] ::consume_connection_window - - + 0.00% [.] ::deserialize_any::< as serde_core::de::Deserialize>::deserialize::VecVisitor> - - + 0.00% [.] 
consciousness::config::config_path - - + 0.00% [k] fsnotify_peek_first_event - - + 0.00% [k] selinux_inode_permission - - + 0.00% [.] statx - - + 0.00% [.] as figment::coalesce::Coalescible>::coalesce - - + 0.00% [k] inotify_poll - - + 0.00% [.] ::reborrow - - + 0.00% [k] idle_cpu - - + 0.00% [k] irq_work_run_list - - + 0.00% [k] __pi_memcpy - - + 0.00% [k] sched_tick - - + 0.00% [k] account_user_time - - + 0.00% [k] hrtimer_interrupt - - + 0.00% [k] acct_account_cputime - - + 0.00% [k] calc_wheel_index - - + 0.00% [.] >::pop:: - - + 0.00% [k] _raw_spin_unlock - - + 0.00% [k] led_trigger_blink_oneshot - - + 0.00% [k] using_native_sched_clock - - + 0.00% [k] rb_insert_color - - + 0.00% [.] alloc_perturb - - + 0.00% [k] ahci_handle_port_intr - - + 0.00% [k] _find_next_and_bit - - + 0.00% [k] irq_enter_rcu - - + 0.00% [k] psi_flags_change - - + 0.00% [k] __note_gp_changes - - + 0.00% [k] tick_nohz_handler - - + 0.00% [.] ::event_loop_thread - - + 0.00% [k] hrtimer_try_to_cancel - - + 0.00% [k] sched_clock_stable - - + 0.00% [k] asm_sysvec_apic_timer_interrupt - - + 0.00% [k] rwb_arm_timer - - + 0.00% [k] rb_next - - + 0.00% [k] irqentry_enter - - + 0.00% [k] __remove_hrtimer - - + 0.00% [k] super_written - - + 0.00% [k] mod_timer - - + 0.00% [k] update_sd_lb_stats.constprop.0 - - + 0.00% [.] , h2::proto::streams::prioritize::Prioritized>> as futures_core::stream::Stream>::poll_next - - + 0.00% [.] , h2::client::Peer>>::clear_expired_reset_streams - - + 0.00% [.] ::maintenance - - + 0.00% [k] __alloc_skb - - + 0.00% [k] __pmu_ctx_sched_out - - + 0.00% [k] igb_poll - - + 0.00% [k] tcp_stream_memory_free - - + 0.00% [.] ::record_data - - + 0.00% [k] netif_skb_features - - + 0.00% [.] ::serialize::<&mut serde_json::ser::Serializer<&mut alloc::vec::Vec>> - - + 0.00% [k] sd_uninit_command - - + 0.00% [.] >::send - - + 0.00% [k] inotify_read - - + 0.00% [k] try_to_wake_up - - + 0.00% [k] lookup_fast - - + 0.00% [.] 
::new - - + 0.00% [k] __cgroup_account_cputime - - + 0.00% [k] blk_flush_complete_seq - - + 0.00% [.] ::get_segment - - + 0.00% [.] ::from_capnp - - + 0.00% [k] _raw_spin_rq_lock_irqsave - - + 0.00% [k] native_apic_mem_eoi - - + 0.00% [k] terminate_walk - - + 0.00% [.] >::child_for_key::<&[u8]> - - + 0.00% [.] core::ptr::drop_in_place:: - - + 0.00% [k] scsi_finish_command - - + 0.00% [k] __memcg_slab_free_hook - - + 0.00% [k] ext4_release_file - - + 0.00% [k] scsi_decide_disposition - - + 0.00% [k] update_cfs_group - - + 0.00% [.] consciousness::hippocampus::store::capnp::read_text - - + 0.00% [k] mempool_free_slab - - + 0.00% [.] >::range::<&[u8], core::ops::range::RangeInclusive<&[u8]>> - - + 0.00% [.] std::sys::env::unix::getenv::{closure#0} - - + 0.00% [.] core::slice::sort::shared::pivot::median3_rec::<(alloc::string::String, consciousness::locks::LockStats), <[(alloc::string::String, consciousness::locks::LockStats)]>::sort_by::{closure#0}> - - + 0.00% [k] ext4_wait_block_bitmap - - + 0.00% [.] ::deserialize_any< as serde_core::de::Deserialize>::deserialize::VecVisitor>::{closure#0}> as serde_core::de::SeqAccess>::next_element_seed::> - - + 0.00% [.] ::new - - + 0.00% [.] core::fmt::float::float_to_decimal_common_shortest:: - - + 0.00% [.] getenv - - + 0.00% [.] ::_join - - + 0.00% [k] irqentry_exit_to_user_mode - - + 0.00% [.] as capnp::private::arena::ReaderArena>::check_offset - - + 0.00% [k] ext4_get_inode_loc - - + 0.00% [.] >::insert - - + 0.00% [k] fsnotify_open_perm_and_set_mode - - + 0.00% [.] ::avg_clustering_coefficient - - + 0.00% [k] tomoyo_init_request_info - - + 0.00% [.] ::fmt - - + 0.00% [.] >::pop - - + 0.00% [k] __list_add_valid_or_report - - + 0.00% [.] ::load - - + 0.00% [k] netdev_pick_tx - - + 0.00% [k] ata_qc_complete_multiple - - + 0.00% [k] do_filp_open - - + 0.00% [.] as serde_core::de::Deserializer>::deserialize_any::<::deserialize::__Visitor> - - + 0.00% [.] ::register_by_ref - - + 0.00% [.] 
::poll_flush - - + 0.00% [k] dev_gro_receive - - + 0.00% [.] as alloc::vec::spec_from_iter::SpecFromIter<(alloc::string::String, figment::value::value::Value), core::iter::adapters::map::Map, >>::from::{closure#0}>>>::from_iter - - + 0.00% [.] as core::iter::traits::iterator::Iterator>::fold::<(), core::iter::adapters::map::map_fold<(char, isize), char, (), ::transform_range>::{closure#1}, core::iter::traits::iterator::Iterator::for_each::call>::extend, ::transform_range>::{closure#1}>>::{closure#0}>::{closure#0}>::{closure#0}> - - + 0.00% [k] step_into - - + 0.00% [k] refill_stock - - + 0.00% [.] core::ptr::drop_in_place:: - - + 0.00% [k] strncpy_from_user - - + 0.00% [.] ::poll_data - - + 0.00% [.] ::discard - - + 0.00% [.] <&dyn core::fmt::Debug as core::fmt::Debug>::fmt - - + 0.00% [.] <&mut ::merge_all::{closure#0} as core::ops::function::FnMut<((usize, &[tokenizers::models::bpe::word::Symbol]),)>>::call_mut - - + 0.00% [.] >::try_with::, tokio::task::coop::poll_proceed::{closure#0}>::{closure#0}, core::task::poll::Poll> - - + 0.00% [k] pick_task_fair - - + 0.00% [.] std::sys::fs::unix::try_statx - - + 0.00% [k] futex_hash_put - - + 0.00% [k] vfs_read - - + 0.00% [.] ::write_str - - + 0.00% [k] avc_has_perm_noaudit - - + 0.00% [.] as core::fmt::Debug>::fmt - - + 0.00% [k] kmem_cache_alloc_noprof - - + 0.00% [.] ::notify_parked_local - - + 0.00% [.] core::slice::sort::shared::smallsort::insertion_sort_shift_left::<(alloc::string::String, figment::value::value::Value), <[(alloc::string::String, figment::value::value::Value)]>::sort_by< as core::iter::traits::collect::FromIterator<(alloc::string::String, figment::value::value::Value)>>::from_iter, >>::from::{closure#0}>>::{closure#0}>::{closure#0}> - - + 0.00% [.] core::num::imp::flt2dec::strategy::grisu::format_shortest_opt - - + 0.00% [.] 
::tokenize - - + 0.00% [k] __smp_call_single_queue - - + 0.00% [k] __fput - - + 0.00% [k] __napi_build_skb - - + 0.00% [k] set_next_buddy - - + 0.00% [k] selinux_file_open - - + 0.00% [k] igb_msix_ring - - + 0.00% [k] copy_from_kernel_nofault - - + 0.00% [.] core::ptr::drop_in_place:: - - + 0.00% [.] >::get:: - - + 0.00% [.] ::end - - + 0.00% [.] ::value_start - - + 0.00% [.] >::reserve::do_reserve_and_handle:: - - + 0.00% [k] bpf_lsm_socket_recvmsg - - + 0.00% [.] >::new - - + 0.00% [.] < as serde_core::de::Deserialize>::deserialize::VecVisitor as serde_core::de::Visitor>::visit_seq::<&mut serde_json::value::de::SeqDeserializer> - - + 0.00% [k] mutex_lock - - + 0.00% [k] __ip_finish_output - - + 0.00% [.] ::filled_mut - - + 0.00% [.] ::is_special_token - - + 0.00% [k] ip_queue_xmit - - + 0.00% [k] simple_copy_to_iter - - + 0.00% [k] tcp_poll - - + 0.00% [.] ::from_utf8_lossy - - + 0.00% [.] unicode_normalization_alignments::lookups::composition_table - - + 0.00% [.] as core::ops::drop::Drop>::drop - - + 0.00% [.] >::dying_next - - + 0.00% [.] ::deserialize:: - - + 0.00% [k] ksys_read - - + 0.00% [k] ip_skb_dst_mtu - - + 0.00% [.] as alloc::vec::spec_from_iter::SpecFromIter<(alloc::string::String, figment::value::value::Value), core::iter::adapters::map::Map, >>::from::{closure#0}>>>::from_iter - - + 0.00% [k] do_mkdirat - - + 0.00% [k] __tcp_transmit_skb - - + 0.00% [k] selinux_ip_output - - + 0.00% [.] ::write - - + 0.00% [.] as alloc::vec::spec_from_iter::SpecFromIter, >::decode::{closure#0}>>>::from_iter - - + 0.00% [.] ::hash_one::<&&core::panic::location::Location> - - + 0.00% [.] __syscall_cancel - - + 0.00% [k] __x64_sys_futex - - + 0.00% [.] consciousness::agent::context::scan_close_tag - - + 0.00% [k] path_get - - + 0.00% [k] dev_hard_start_xmit - - + 0.00% [k] hrtimer_try_to_cancel.part.0 - - + 0.00% [.] , notify::error::Error>>>::recv::{closure#1} - - + 0.00% [k] bpf_lsm_inode_permission - - + 0.00% [.] 
alloc::fmt::format::format_inner - - + 0.00% [k] add_transaction_credits - - + 0.00% [k] sched_clock_noinstr - - + 0.00% [.] ::debug_struct_field3_finish - - + 0.00% [.] ::deserialize_any::<::deserialize::__Visitor> - - + 0.00% [k] selinux_file_permission - - + 0.00% [.] ::slice::> - - + 0.00% [.] , notify::error::Error>>>::send - - + 0.00% [.] <&std::fs::File as std::io::Write>::write - - + 0.00% [.] >::pop - - + 0.00% [k] __list_del_entry_valid_or_report - - + 0.00% [.] as core::ops::drop::Drop>::drop - - + 0.00% [.] ::consume - - + 0.00% [.] ::nfc - - + 0.00% [k] __put_user_8 - - + 0.00% [.] ::is_contained_in - - + 0.00% [.] ::now - - + 0.00% [.] ::release_capacity - - + 0.00% [k] avg_vruntime - - + 0.00% [.] ::next - - + 0.00% [.] ::end_processing_scheduled_tasks - - + 0.00% [.] ::run::{closure#0}, alloc::sync::Arc>>::poll - - + 0.00% [k] __skb_datagram_iter - - + 0.00% [k] ip_sublist_rcv - - + 0.00% [.] ::write - - + 0.00% [k] file_has_perm - - + 0.00% [.] clock_gettime@@GLIBC_2.17 - - + 0.00% [.] ::pad - - + 0.00% [.] as anyhow::Context<(), capnp::Error>>::with_context::::append_relations::{closure#0}> - - + 0.00% [.] ::key_unchecked - - + 0.00% [.] ::release_connection_capacity - - + 0.00% [k] switch_fpu_return - - + 0.00% [.] ::saturating_duration_since - - + 0.00% [.] >, rustls::client::client_conn::connection::ClientConnection>>::read_io - - + 0.00% [.] ::has_message_ready - - + 0.00% [.] core::ptr::drop_in_place:: - - + 0.00% [.] ::flush - - + 0.00% [k] perf_event_groups_next - - + 0.00% [k] __futex_hash - - + 0.00% [k] sock_recvmsg - - + 0.00% [.] json_five::de::from_str:: - - + 0.00% [.] ::deserialize_any:: - - + 0.00% [k] xas_start - - + 0.00% [.] ::wake - - + 0.00% [.] ::next_match - - + 0.00% [.] match_at - - + 0.00% [k] __tcp_select_window - - + 0.00% [k] tcp_recvmsg - - + 0.00% [k] ext4_ext_insert_extent - - + 0.00% [.] ::wake - - + 0.00% [k] tcp_established_options - - + 0.00% [.] 
serde_json::value::de::visit_array::< as serde_core::de::Deserialize>::deserialize::VecVisitor> - - + 0.00% [k] __hrtimer_setup - - + 0.00% [k] common_interrupt - - + 0.00% [.] >::add - - + 0.00% [.] ::park_condvar - - + 0.00% [k] ip_send_check - - + 0.00% [k] igb_xmit_frame - - + 0.00% [.] ::neighbors - - + 0.00% [.] _int_realloc - - + 0.00% [k] nf_hook_slow - - + 0.00% [.] <&mio::net::tcp::stream::TcpStream as std::io::Read>::read - - + 0.00% [k] skb_try_coalesce - - + 0.00% [.] ::fmt - - + 0.00% [.] >>::recv_data - - + 0.00% [.] ::finish_grow - - + 0.00% [.] ::parse_next_component_back - - + 0.00% [k] tcp_update_recv_tstamps - - + 0.00% [k] _copy_from_user - - + 0.00% [k] rcu_note_context_switch - - + 0.00% [.] ::try_reserve_exact - - + 0.00% [.] __GI___libc_write - - + 0.00% [k] fdget_pos - - + 0.00% [.] ::send_pending_go_away::, h2::proto::streams::prioritize::Prioritized>> - - + 0.00% [k] folio_mark_accessed - - + 0.00% [.] core::ptr::drop_in_place::> - - + 0.00% [.] ::poll_data - - + 0.00% [.] >::into_owned - - + 0.00% [.] >>> as hyper::rt::io::Write>::poll_flush - - + 0.00% [k] cyc2ns_read_begin - - + 0.00% [k] skb_defer_free_flush - - + 0.00% [.] ::send_pending_pong::, h2::proto::streams::prioritize::Prioritized>> - - + 0.00% [k] kmem_cache_alloc_bulk_noprof - - + 0.00% [k] __ext4_journal_get_write_access - - + 0.00% [.] >>::insert - - + 0.00% [k] net_rx_action - - + 0.00% [.] >, ::pre_tokenize::{closure#1}::{closure#0}> as core::iter::traits::iterator::Iterator>::fold::<(), core::iter::traits::iterator::Iterator::for_each::call<(char, isize), >::extend_trusted>, ::pre_tokenize::{closure#1}::{closure#0}>>::{closure#0}>::{closure#0}> - - + 0.00% [.] > as core::future::future::Future>::poll - - + 0.00% [.] ::drop - - + 0.00% [k] get_nohz_timer_target - - + 0.00% [.] core::ptr::drop_in_place::> - - + 0.00% [.] as rustls::conn::connection::PlaintextSink>::flush - - + 0.00% [.] ::process - - + 0.00% [.] 
::register - - + 0.00% [k] blk_attempt_bio_merge.part.0 - - + 0.00% [.] ::open_within - - + 0.00% [k] cyc2ns_read_end - - + 0.00% [.] __libc_recv - - + 0.00% [.] ::park_driver - - + 0.00% [k] ksys_write - - + 0.00% [k] wbc_detach_inode - - + 0.00% [.] bytes::bytes::static_drop - - + 0.00% [k] arch_perf_update_userpage - - + 0.00% [.] ::push_back:: - - + 0.00% [k] lock_sock_nested - - + 0.00% [k] __bitmap_and - - + 0.00% [.] core::ptr::drop_in_place::, alloc::vec::Vec, ::decode_chain::{closure#0}>> - - + 0.00% [.] ::ensure_recv_open - - + 0.00% [k] ip_finish_output2 - - + 0.00% [.] as bytes::buf::buf_impl::Buf>::get_uint - - + 0.00% [.] ::eq - - + 0.00% [k] tcp_recv_timestamp - - + 0.00% [k] futex_wait - - + 0.00% [.] >::try_with::, tokio::task::coop::poll_proceed::{closure#0}>::{closure#0}, core::task::poll::Poll> - - + 0.00% [.] as core::ops::drop::Drop>::drop - - + 0.00% [k] __x64_sys_read - - + 0.00% [.] , alloc::collections::btree::node::marker::KV>>::remove_leaf_kv::<>::remove_kv::{closure#0}, alloc::alloc::Global> - - + 0.00% [k] path_openat - - + 0.00% [k] queue_work_on - - + 0.00% [k] alloc_file_pseudo - - + 0.00% [.] ::add_path - - + 0.00% [.] ::park_timeout - - + 0.00% [k] plist_del - - + 0.00% [k] __slab_free - - + 0.00% [k] wbt_data_dir - - + 0.00% [.] , notify::error::Error>>>::recv - - + 0.00% [k] _find_next_or_bit - - + 0.00% [k] unix_stream_recvmsg - - + 0.00% [k] native_write_msr - - + 0.00% [k] __percpu_counter_sum - - + 0.00% [k] nmi_restore - - + 0.00% [k] calc_timer_values - - + 0.00% [k] pv_native_write_cr2 - - + 0.00% [k] blk_mq_complete_request_remote - - + 0.00% [.] >::recv - - + 0.00% [k] perf_event_update_userpage - - + 0.00% [k] entry_SYSCALL_64_after_hwframe - - + 0.00% [k] exc_nmi - - + 0.00% [k] end_repeat_nmi - - + 0.00% [k] entry_SYSRETQ_unsafe_stack - - + 0.00% [.] 
__syscall_cancel_arch - - + 0.00% [k] amd_cc_platform_has - - + 0.00% [k] local_clock_noinstr - - + 0.00% [k] perf_event_idx_default - - + 0.00% [k] local_clock - - + 0.00% [k] nmi_handle.part.0 - - + + +# +# (Tip: To see how parallelism changes over time, try: perf report -F time,latency,parallelism --time-quantum=1s) +# diff --git a/sa-schedule-aligned-variation.py b/sa-schedule-aligned-variation.py new file mode 100644 index 0000000..405ee2b --- /dev/null +++ b/sa-schedule-aligned-variation.py @@ -0,0 +1,200 @@ +"""After applying Procrustes alignment to remove known gauge freedoms +(per-head d_h rotation tying Q/K/V/O, per-layer d_ff rotation tying +gate/up/down), measure per-family cos-sim between adjacent layers across +the whole network. + +Runs Procrustes SVDs on GPU for speed. +""" +import argparse +import json +import numpy as np +import torch +from transformers import AutoModelForCausalLM + + +def procrustes_gpu(M): + """Orthogonal R maximizing tr(R M). R = U V^T where M = U Σ V^T. + M on GPU; returns R on GPU.""" + U, _, Vh = torch.linalg.svd(M, full_matrices=False) + return U @ Vh + + +def frob_gpu(x): + return float(torch.linalg.norm(x).item()) + + +def normalize_fro_gpu(x, eps=1e-12): + n = torch.linalg.norm(x) + return x / n.clamp_min(eps) + + +@torch.no_grad() +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", default="Qwen/Qwen3-4B") + ap.add_argument("--out", default="/tmp/sa-aligned-variation.json") + ap.add_argument("--device", default="cuda") + ap.add_argument("--pairs", default="", + help="Comma-separated list of L indices to run pair (L, L+1) for. " + "Empty = all pairs. E.g. 
'0,20,30,38,46,52,57' samples phases.") + args = ap.parse_args() + + dev = torch.device(args.device) + print(f"Loading {args.model} ...", flush=True) + model = AutoModelForCausalLM.from_pretrained( + args.model, + torch_dtype=torch.float32, + device_map="cpu", + trust_remote_code=True, + attn_implementation="eager", + ) + cfg = model.config + num_layers = cfg.num_hidden_layers + num_heads = cfg.num_attention_heads + num_kv_heads = getattr(cfg, "num_key_value_heads", num_heads) + hidden = cfg.hidden_size + head_dim = getattr(cfg, "head_dim", hidden // num_heads) + intermediate = cfg.intermediate_size + print(f" L={num_layers} H={num_heads} kv={num_kv_heads} hd={head_dim} " + f"hidden={hidden} ff={intermediate}", flush=True) + + # Collect per-layer weights + layers = [] + for L in range(num_layers): + layer = model.model.layers[L] + attn = layer.self_attn + mlp = layer.mlp + layers.append({ + "q_proj": attn.q_proj.weight.detach().float(), + "k_proj": attn.k_proj.weight.detach().float(), + "v_proj": attn.v_proj.weight.detach().float(), + "o_proj": attn.o_proj.weight.detach().float(), + "gate_proj": mlp.gate_proj.weight.detach().float(), + "up_proj": mlp.up_proj.weight.detach().float(), + "down_proj": mlp.down_proj.weight.detach().float(), + }) + del model + + # Per-adjacent-pair analysis + aligned_cos = {fam: {} for fam in + ["q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj"]} + + if args.pairs: + pair_L_list = [int(x) for x in args.pairs.split(",")] + else: + pair_L_list = list(range(num_layers - 1)) + + for L in pair_L_list: + A = layers[L] + B = layers[L + 1] + + # -------- Per-head attention alignment (d_h × d_h) -------- + Qa = A["q_proj"].to(dev).reshape(num_heads, head_dim, hidden) + Qb = B["q_proj"].to(dev).reshape(num_heads, head_dim, hidden) + Ka = A["k_proj"].to(dev).reshape(num_kv_heads, head_dim, hidden) + Kb = B["k_proj"].to(dev).reshape(num_kv_heads, head_dim, hidden) + Va = A["v_proj"].to(dev).reshape(num_kv_heads, head_dim, 
hidden) + Vb = B["v_proj"].to(dev).reshape(num_kv_heads, head_dim, hidden) + # o_proj is (hidden, num_heads*head_dim); split per head + Oa = A["o_proj"].to(dev).reshape(hidden, num_heads, head_dim).permute(1, 0, 2).contiguous() + Ob = B["o_proj"].to(dev).reshape(hidden, num_heads, head_dim).permute(1, 0, 2).contiguous() + # (num_heads, hidden, head_dim) + + q_cos = [] + k_cos = [] + v_cos = [] + o_cos = [] + for h in range(num_heads): + kv_h = (h * num_kv_heads) // num_heads + qa = normalize_fro_gpu(Qa[h]) + qb = normalize_fro_gpu(Qb[h]) + ka = normalize_fro_gpu(Ka[kv_h]) + kb = normalize_fro_gpu(Kb[kv_h]) + va = normalize_fro_gpu(Va[kv_h]) + vb = normalize_fro_gpu(Vb[kv_h]) + oa = normalize_fro_gpu(Oa[h]) + ob = normalize_fro_gpu(Ob[h]) + + # Cross-correlation for joint alignment: we want R s.t. + # R qa ≈ qb (etc), minimize sum of ||R X_a - X_b||² → + # max tr(R M) with M = qa qb^T + ka kb^T + va vb^T + oa^T ob + M = qa @ qb.T + ka @ kb.T + va @ vb.T + oa.T @ ob + R = procrustes_gpu(M) + + # Post-alignment cos-sim (since matrices unit-normalized, cos + # = = tr(qb^T R qa) = tr(R qa qb^T)) + q_cos.append(float(torch.sum(R @ qa * qb).item())) + k_cos.append(float(torch.sum(R @ ka * kb).item())) + v_cos.append(float(torch.sum(R @ va * vb).item())) + # For O: O after rotation = oa R^T; cos = + o_cos.append(float(torch.sum(oa @ R.T * ob).item())) + + aligned_cos["q_proj"][L] = float(np.mean(q_cos)) + aligned_cos["k_proj"][L] = float(np.mean(k_cos)) + aligned_cos["v_proj"][L] = float(np.mean(v_cos)) + aligned_cos["o_proj"][L] = float(np.mean(o_cos)) + + # -------- d_ff × d_ff alignment for gate/up/down -------- + ga = normalize_fro_gpu(A["gate_proj"].to(dev)) + gb = normalize_fro_gpu(B["gate_proj"].to(dev)) + ua = normalize_fro_gpu(A["up_proj"].to(dev)) + ub = normalize_fro_gpu(B["up_proj"].to(dev)) + da = normalize_fro_gpu(A["down_proj"].to(dev)) # (hidden, d_ff) + db = normalize_fro_gpu(B["down_proj"].to(dev)) + + # All of ga, gb, ua, ub are (d_ff, hidden); da, db 
are (hidden, d_ff) + # Cross-correlation: M = ga gb^T + ua ub^T + da^T db (d_ff × d_ff) + M_ff = ga @ gb.T + ua @ ub.T + da.T @ db + S = procrustes_gpu(M_ff) + + aligned_cos["gate_proj"][L] = float(torch.sum(S @ ga * gb).item()) + aligned_cos["up_proj"][L] = float(torch.sum(S @ ua * ub).item()) + aligned_cos["down_proj"][L] = float(torch.sum(da @ S.T * db).item()) + + # Free GPU memory + del Qa, Qb, Ka, Kb, Va, Vb, Oa, Ob + del ga, gb, ua, ub, da, db, M_ff, S + torch.cuda.empty_cache() + + print(f" done pair L={L}->L+1 " + f"(q={aligned_cos['q_proj'][L]:+.4f} gate={aligned_cos['gate_proj'][L]:+.4f})", + flush=True) + + # Report + print("\n=== Adjacent-layer cos-sim AFTER Procrustes alignment ===\n") + print(" cos=1 means identical after gauge rotation; cos=0 means orthogonal\n") + header = " L " + for fam in aligned_cos: + header += f" {fam:>12}" + print(header) + for L in sorted(pair_L_list): + if L not in aligned_cos["q_proj"]: + continue + row = f" {L:>2}" + for fam in aligned_cos: + row += f" {aligned_cos[fam][L]:+12.4f}" + print(row) + + print("\n=== Per-family summary (aligned) ===") + print(f" {'family':>14} {'mean_cos':>10} {'median_cos':>11} " + f"{'aligned_resid':>14}") + for fam, vals_dict in aligned_cos.items(): + vs = np.array(list(vals_dict.values())) + if len(vs) == 0: + continue + resid = float(np.sqrt(np.maximum(1.0 - vs**2, 0.0)).mean()) + print(f" {fam:>14} {vs.mean():>+10.4f} {np.median(vs):>+11.4f} " + f"{resid:>14.4f}") + + with open(args.out, "w") as f: + json.dump({ + "model": args.model, + "num_layers": num_layers, + "aligned_cos": aligned_cos, + }, f, indent=2) + print(f"\nSaved: {args.out}") + + +if __name__ == "__main__": + main() diff --git a/sa-schedule-analyze-aligned.py b/sa-schedule-analyze-aligned.py new file mode 100644 index 0000000..919de10 --- /dev/null +++ b/sa-schedule-analyze-aligned.py @@ -0,0 +1,157 @@ +"""Analyze aligned_variation output to answer the training-artifact vs +specialization question. 
+ +Inputs: qwen3-*-null.json (raw cos-sim) + qwen3-*-aligned.json (aligned cos-sim) + +For each layer pair where aligned data exists, compare: + raw_cos(L) — before Procrustes alignment + aligned_cos(L) — after Procrustes alignment + delta = aligned_cos - raw_cos + +If delta is substantial (aligned much larger than raw), rotation gauge +was hiding shared structure → training-artifact hypothesis supported. +If delta ≈ 0, specialization is real (rotation can't find shared +structure because there isn't any). + +Stratify by phase to test prediction that LATE layers have LARGER delta +(more rotation-gauge noise, less real specialization). +""" +import argparse +import json +import numpy as np + + +def phase_of(L, num_layers): + """Rough phase assignment based on measured 32B entropy boundaries. + For other models we'd refit — but shape should be similar.""" + if num_layers == 64: # Qwen3-32B + if L <= 6: + return "A" + elif L <= 9: + return "B" + elif L <= 31: + return "C" + elif L <= 46: + return "D" + elif L <= 58: + return "E" + else: + return "tail" + elif num_layers == 36: # Qwen3-4B + if L <= 6: + return "A" + elif L <= 9: + return "B" + elif L <= 23: + return "C" + elif L <= 33: + return "D" + else: + return "tail" + else: + frac = L / num_layers + if frac < 0.11: + return "A" + elif frac < 0.15: + return "B" + elif frac < 0.5: + return "C" + elif frac < 0.75: + return "D" + elif frac < 0.92: + return "E" + else: + return "tail" + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("null_json", help="output of null_residual.py") + ap.add_argument("aligned_json", help="output of aligned_variation.py") + args = ap.parse_args() + + null = json.load(open(args.null_json)) + aligned = json.load(open(args.aligned_json)) + + num_layers = aligned["num_layers"] + aligned_cos = aligned["aligned_cos"] # dict: family -> {L: cos} + pair_results = null["pair_results"] # list of {L, L_next, families: {family: {cos, ...}}} + + # Build raw_cos dict from null output 
+ raw_cos = {fam: {} for fam in ["q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj"]} + for pr in pair_results: + L = pr["L"] + for fam in raw_cos: + if fam in pr["families"]: + raw_cos[fam][L] = pr["families"][fam]["cos"] + + print(f"=== Aligned vs Raw cos-sim comparison ({args.aligned_json}) ===") + print(f" {num_layers} layers total; aligned data for " + f"{len(aligned_cos['q_proj'])} pairs\n") + + # Per-pair table: L, phase, family cos-sims raw and aligned + families = ["q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj"] + + print(f" {'L':>3} {'phase':>5}", end="") + for fam in families: + print(f" {fam+'_raw':>10} {fam+'_ali':>10}", end="") + print() + + L_keys = sorted([int(L) for L in aligned_cos["q_proj"].keys()]) + for L in L_keys: + Lstr = str(L) + phase = phase_of(L, num_layers) + row = f" {L:>3} {phase:>5}" + for fam in families: + r = raw_cos[fam].get(L, None) + a = aligned_cos[fam].get(Lstr, None) + rstr = f"{r:+10.4f}" if r is not None else " N/A" + astr = f"{a:+10.4f}" if a is not None else " N/A" + row += f" {rstr} {astr}" + print(row) + + # Aggregate by phase: mean (aligned - raw) per family per phase + print("\n=== Per-phase mean delta (aligned_cos - raw_cos) by family ===") + print(f" Large positive delta = rotation alignment revealed shared") + print(f" structure. 
Small delta = specialization is gauge-independent.\n") + + phase_deltas = {} + for L in L_keys: + Lstr = str(L) + ph = phase_of(L, num_layers) + for fam in families: + r = raw_cos[fam].get(L, None) + a = aligned_cos[fam].get(Lstr, None) + if r is not None and a is not None: + phase_deltas.setdefault(ph, {}).setdefault(fam, []).append(a - r) + + print(f" {'phase':>6}", end="") + for fam in families: + print(f" {fam:>10}", end="") + print() + for ph in sorted(phase_deltas.keys()): + print(f" {ph:>6}", end="") + for fam in families: + vals = phase_deltas[ph].get(fam, []) + if vals: + print(f" {np.mean(vals):+10.4f}", end="") + else: + print(f" {'—':>10}", end="") + print() + + # Interpretation + print("\n=== Interpretation ===") + print(" Prediction under training-artifact hypothesis:") + print(" delta(Phase E) > delta(Phase C) for projection families") + print(" → late layers have more rotation-gauge-hidden structure") + print(" → specialization is partly training noise, not structural") + print("") + print(" Prediction under real-specialization hypothesis:") + print(" delta ~ 0 everywhere") + print(" → layers genuinely point in different directions, gauge irrelevant") + + +if __name__ == "__main__": + main() diff --git a/sa-schedule-analyze-grams.py b/sa-schedule-analyze-grams.py new file mode 100644 index 0000000..b4cdc4e --- /dev/null +++ b/sa-schedule-analyze-grams.py @@ -0,0 +1,168 @@ +"""Analyze operator-level inter-layer alignment from the grams + eigdirs files. + +Input: + qwen3-4b-grams.json (gram[L,L',h], fro_sq[L,h]) + qwen3-4b-grams-eigdirs.pt (eig_dirs[L,h,topk,hidden], sym_eigs[L,h,2*head_dim]) + +Questions: + (a) Operator cos-sim between layers. cos(g_L^h, g_L'^h) = gram / √(fro_sq fro_sq'). + If ~1 → same operator up to scalar. If low → distinct operators. + (b) Scalar-rescale residual using full operator (not spectrum): + optimal T = gram / fro_sq', residual_frac = √(1 - cos²). + (c) Curvature-sign alignment. 
For each (L, anchor) pair, what fraction of + top-k signed eigenvalues share sign with the anchor's? + (d) Top-k eigensubspace alignment. Principal angles between span{eig_dirs_L} + and span{eig_dirs_anchor}. + + Compare: operator cos-sim vs spectral cos-sim (from prior analysis). The + sheaf-rs finding was that spectral shape converges across layers while + eigenvectors don't. We want to confirm/refute that within QK in Qwen3-4B. +""" +import argparse +import json +import numpy as np +import torch + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("gram_json") + ap.add_argument("--anchor", type=int, default=-1, + help="anchor layer index; -1 = last") + args = ap.parse_args() + + with open(args.gram_json) as f: + d = json.load(f) + num_layers = d["num_layers"] + num_heads = d["num_heads"] + head_dim = d["head_dim"] + hidden = d["hidden_size"] + gram = np.array(d["gram"]) # (L, L, H) + # NOTE: fro_sq from the json is ||W_K W_Q^T||_F^2 (the measure.py + # shortcut), NOT ||g||_F^2 = ||W_K^T W_Q||_F^2 which is what the gram + # diagonal gives. Different objects. Use gram diagonal for normalization. 
+ diag_sq = np.array([[gram[L, L, h] for h in range(num_heads)] + for L in range(num_layers)]) # (L, H) + diag = np.sqrt(np.maximum(diag_sq, 1e-20)) # ||g_L^h||_F + + pt = torch.load(d.get("eigdirs_path", args.gram_json.replace(".json", "-eigdirs.pt")), + weights_only=True) + eig_dirs = pt["eig_dirs"].double().numpy() # (L, H, topk, hidden) + sym_eigs = pt["sym_eigs"].double().numpy() # (L, H, 2*head_dim) + topk = eig_dirs.shape[2] + anchor = args.anchor if args.anchor >= 0 else num_layers - 1 + + # ========================================================== + # (a) Operator cos-sim matrix, averaged over heads + # ========================================================== + cos_mat = np.zeros((num_layers, num_layers)) + for L in range(num_layers): + for Lp in range(num_layers): + denom = diag[L] * diag[Lp] + per_h = gram[L, Lp] / np.maximum(denom, 1e-20) + cos_mat[L, Lp] = per_h.mean() + + print(f"=== (a) Operator cos-sim between layers, averaged over {num_heads} heads ===") + print(f" diagonal (should be 1.0): mean {np.diag(cos_mat).mean():.4f}") + # Adjacent-layer cos-sim + adj = np.array([cos_mat[L, L+1] for L in range(num_layers-1)]) + print(f" adjacent layers cos-sim: mean {adj.mean():.4f} min {adj.min():.4f} max {adj.max():.4f}") + # Layer-to-anchor cos-sim + to_anchor = cos_mat[:, anchor] + print(f" layer -> anchor L={anchor} cos-sim:") + print(f" {'L':>3} {'cos':>7} {'T_opt':>7} {'resid_frac':>10}") + for L in range(num_layers): + c = to_anchor[L] + T = float(np.mean(gram[L, anchor] / np.maximum(diag_sq[anchor], 1e-20))) + r = float(np.sqrt(max(1.0 - c**2, 0.0))) + print(f" {L:>3} {c:+.4f} {T:+7.3f} {r:>10.4f}") + + # Long-range cos-sim (L=0 to L=35 vs L=17 to L=35 etc.) 
+ print(f"\n long-range: cos(L=0, last) = {cos_mat[0, -1]:+.3f} " + f"cos(L=midish, last) = {cos_mat[num_layers//2, -1]:+.3f}") + + # ========================================================== + # (b) Full scalar-rescale residual using the gram + # ========================================================== + print(f"\n=== (b) Operator-level scalar rescale to anchor L={anchor} ===") + # residual_frac² = 1 - cos²(g_L, g_anchor) (per head) + print(f" {'L':>3} {'mean_cos':>9} {'mean_resid':>10}") + for L in range(num_layers): + per_h_cos = gram[L, anchor] / np.maximum(diag[L] * diag[anchor], 1e-20) + per_h_resid = np.sqrt(np.clip(1.0 - per_h_cos**2, 0.0, 1.0)) + print(f" {L:>3} {per_h_cos.mean():>+9.4f} {per_h_resid.mean():>10.4f}") + + # ========================================================== + # (c) Curvature-sign alignment + # ========================================================== + print(f"\n=== (c) Curvature-sign alignment vs anchor L={anchor} ===") + # Look at top-k eigenvalues by magnitude (already sorted that way in measure). + # Fraction of top-k (L, h) whose sign matches the anchor's i-th eigenvalue. 
+ for k_use in [2, 4, 8, 16, 32, 64, 128, 256]: + if k_use > sym_eigs.shape[-1]: + continue + # sign of top-k_use eigenvalues at layer L vs at anchor, per (L, h) + sign_L = np.sign(sym_eigs[:, :, :k_use]) # (L, H, k_use) + sign_a = np.sign(sym_eigs[anchor, :, :k_use]) # (H, k_use) + agree = (sign_L == sign_a[None, :, :]).mean(axis=-1) # (L, H) + print(f" top-{k_use:>3} signs: mean agree = {agree.mean():.3f} " + f"by layer range: early {agree[:12].mean():.3f} " + f"mid {agree[12:24].mean():.3f} late {agree[24:].mean():.3f}") + + # Also: distribution of sign-balance per layer (fraction positive eigenvalues) + frac_pos = (sym_eigs[:, :, :2 * head_dim] > 0).mean(axis=(1, 2)) + print(f"\n fraction positive eigenvalues per layer:") + for L in range(num_layers): + print(f" L={L:2} frac+ = {frac_pos[L]:.3f}") + + # ========================================================== + # (d) Eigenspace principal angles + # ========================================================== + print(f"\n=== (d) Top-{topk} eigensubspace principal angles vs anchor L={anchor} ===") + # Per-head: cos of principal angles between row-spans of eig_dirs[L, h] + # and eig_dirs[anchor, h]. Report mean cos angle per layer. 
+ print(f" {'L':>3} {'meanCosPA':>10} {'minCosPA':>10} {'max_top1':>10}") + for L in range(num_layers): + mean_cos_pa_per_h = [] + min_cos_pa_per_h = [] + top1_overlap = [] + for h in range(num_heads): + A = eig_dirs[L, h] # (topk, hidden) rows are unit vectors + B = eig_dirs[anchor, h] # (topk, hidden) + # Orthonormalize rows (they're close-to-orthonormal but not exactly) + Qa, _ = np.linalg.qr(A.T) # hidden × topk + Qb, _ = np.linalg.qr(B.T) + M = Qa.T @ Qb # topk × topk + s = np.linalg.svd(M, compute_uv=False) + mean_cos_pa_per_h.append(s.mean()) + min_cos_pa_per_h.append(s.min()) + # ||² — top-1 eigenvector overlap + top1_overlap.append(float((A[0] @ B[0]) ** 2)) + print(f" {L:>3} {np.mean(mean_cos_pa_per_h):>10.4f} " + f"{np.mean(min_cos_pa_per_h):>10.4f} " + f"{np.mean(top1_overlap):>10.4f}") + + # ========================================================== + # Verdict + # ========================================================== + to_anchor_per_head = np.array([ + (gram[L, anchor] / np.maximum(diag[L] * diag[anchor], 1e-20)).mean() + for L in range(num_layers) + ]) + mean_cos_to_anchor = to_anchor_per_head.mean() + print(f"\n=== Verdict ===") + print(f" mean operator cos-sim to anchor: {mean_cos_to_anchor:+.4f}") + adj_mean = adj.mean() + print(f" mean operator cos-sim adjacent layers: {adj_mean:+.4f}") + if mean_cos_to_anchor > 0.9: + print(" STRONG: same operator up to scalar across all layers.") + elif mean_cos_to_anchor > 0.5: + print(" MEDIUM: substantial shared operator, but layer-specific drift.") + elif mean_cos_to_anchor > 0.1: + print(" WEAK: some alignment; far from single-operator interpretation.") + else: + print(" REJECTED: operators are effectively orthogonal across layers.") + + +if __name__ == "__main__": + main() diff --git a/sa-schedule-analyze.py b/sa-schedule-analyze.py new file mode 100644 index 0000000..65284f3 --- /dev/null +++ b/sa-schedule-analyze.py @@ -0,0 +1,108 @@ +"""Analyze the SA schedule readout JSON: per-head variance, 
def main():
    """Summarise an SA-schedule readout JSON and render two diagnostic plots.

    Command line:
        input_json   path to the readout JSON (must contain ``num_layers``,
                     ``num_heads``, ``model``, and per-layer ``dynamic`` /
                     ``static`` rows with the per-head lists read below).
        --out-plot   destination of the three-panel line plot
                     (default ``/tmp/sa-schedule.png``).

    Prints per-head coefficient-of-variation and Pearson-correlation
    summaries, writes the line plot, and writes a per-head entropy heatmap
    next to it as ``<stem>-heatmap<ext>``.
    """
    import os  # function-scope import: only needed to derive the heatmap file name

    ap = argparse.ArgumentParser()
    ap.add_argument("input_json")
    ap.add_argument("--out-plot", default="/tmp/sa-schedule.png")
    args = ap.parse_args()

    with open(args.input_json) as f:
        data = json.load(f)

    num_layers = data["num_layers"]
    num_heads = data["num_heads"]
    Ls = np.arange(num_layers)

    # One row per layer, one column per head.
    ent = np.array([row["mean_attention_entropy_per_head"] for row in data["dynamic"]])  # (L, H)
    logit_std = np.array([row["mean_logit_std_per_head"] for row in data["dynamic"]])  # (L, H)
    metric_op = np.array([row["metric_op_per_head"] for row in data["static"]])  # (L, H)

    # Layer-wise mean and spread across heads.
    mean_ent = ent.mean(axis=1)
    std_ent = ent.std(axis=1)
    mean_logit = logit_std.mean(axis=1)
    std_logit = logit_std.std(axis=1)
    mean_metric = metric_op.mean(axis=1)
    std_metric = metric_op.std(axis=1)

    # Per-head variance summary
    print("\nPer-head variance across heads (coefficient of variation = std/mean):")
    print(f" entropy: mean CV = {(std_ent / np.maximum(mean_ent, 1e-6)).mean():.3f}")
    print(f" logit_std: mean CV = {(std_logit / np.maximum(mean_logit, 1e-6)).mean():.3f}")
    print(f" metric_op: mean CV = {(std_metric / np.maximum(mean_metric, 1e-6)).mean():.3f}")

    # Correlations across layers
    corr_ent_metric = np.corrcoef(mean_ent, mean_metric)[0, 1]
    corr_logit_metric = np.corrcoef(mean_logit, mean_metric)[0, 1]
    corr_ent_logit = np.corrcoef(mean_ent, mean_logit)[0, 1]
    print("\nAcross-layer Pearson correlations (averaged over heads):")
    print(f" entropy vs metric_op: {corr_ent_metric:+.3f}")
    print(f" logit_std vs metric_op: {corr_logit_metric:+.3f}")
    print(f" entropy vs logit_std: {corr_ent_logit:+.3f}")

    # Per-head correlation (one value per head): does each head's entropy
    # across layers track its own metric_op across layers?
    head_corrs = []
    for h in range(num_heads):
        c = np.corrcoef(ent[:, h], metric_op[:, h])[0, 1]
        if np.isfinite(c):  # constant columns give NaN; leave them out
            head_corrs.append(c)
    if head_corrs:
        print(f" per-head entropy vs metric_op: mean {np.mean(head_corrs):+.3f} "
              f"std {np.std(head_corrs):.3f} min {min(head_corrs):+.3f} max {max(head_corrs):+.3f}")
    else:
        # min()/max() would raise on an empty list; report the situation instead.
        print(" per-head entropy vs metric_op: n/a (no head has a finite correlation)")

    # Plot
    fig, axes = plt.subplots(3, 1, figsize=(10, 9), sharex=True)

    ax = axes[0]
    ax.fill_between(Ls, mean_ent - std_ent, mean_ent + std_ent, alpha=0.2, color="tab:blue",
                    label="±1 std across heads")
    ax.plot(Ls, mean_ent, color="tab:blue", marker="o", label="mean entropy")
    ax.set_ylabel("attention entropy (nats)")
    ax.set_title(f"{data['model']} — SA schedule readout ({num_layers} layers, {num_heads} heads)")
    ax.legend(loc="upper right")
    ax.grid(alpha=0.3)

    ax = axes[1]
    ax.fill_between(Ls, mean_logit - std_logit, mean_logit + std_logit, alpha=0.2, color="tab:orange",
                    label="±1 std across heads")
    ax.plot(Ls, mean_logit, color="tab:orange", marker="o", label="mean logit std")
    ax.set_ylabel("pre-softmax logit std\n(= implicit sharpness)")
    ax.legend(loc="upper right")
    ax.grid(alpha=0.3)

    ax = axes[2]
    ax.fill_between(Ls, mean_metric - std_metric, mean_metric + std_metric, alpha=0.2, color="tab:green",
                    label="±1 std across heads")
    ax.plot(Ls, mean_metric, color="tab:green", marker="o", label="mean metric op-norm (static)")
    ax.set_ylabel("||W_K^T W_Q|| operator norm\n(static, parameter-only)")
    ax.set_xlabel("layer index L")
    ax.legend(loc="upper right")
    ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.savefig(args.out_plot, dpi=100, bbox_inches="tight")
    plt.close(fig)
    print(f"\nWrote plot to {args.out_plot}")

    # Also save a small heatmap of per-head entropy for visual spread
    heat_fig = plt.figure(figsize=(10, 6))
    plt.imshow(ent.T, aspect="auto", cmap="viridis", origin="lower")
    plt.colorbar(label="attention entropy")
    plt.xlabel("layer L")
    plt.ylabel("head h")
    plt.title(f"{data['model']} — per-head entropy heatmap")
    # Split on the real extension so a path without ".png" does not overwrite
    # the main plot, and a ".png" elsewhere in the path is left alone.
    stem, ext = os.path.splitext(args.out_plot)
    heatmap_path = f"{stem}-heatmap{ext}"
    plt.tight_layout()
    plt.savefig(heatmap_path, dpi=100, bbox_inches="tight")
    plt.close(heat_fig)
    print(f"Wrote heatmap to {heatmap_path}")
# Calibration prompts; main() joins them with blank lines into one token stream.
CALIB = [
    "The Eiffel Tower is located in",
    "Photosynthesis is the process by which",
    "The three branches of the US government are the legislative, executive, and",
    "If a train travels 60 miles per hour for 2.5 hours, the total distance covered is",
    "Solve for x: 3x + 7 = 22. The answer is x =",
    "The derivative of x^3 + 2x^2 is",
    "def fibonacci(n):\n if n < 2:\n return n\n return",
    "# Python list comprehension to square even numbers in 0-9\nresult = ",
    "SELECT name, age FROM users WHERE",
    "She opened the old wooden box and found",
    "The argument in favor of renewable energy is",
    "User: What is the capital of Australia?\nAssistant:",
    "Write a haiku about autumn:\n",
    "Albert Einstein was born in the year",
    "The speed of light in vacuum is approximately",
    "I really loved that movie because",
    "The main difference between a virus and a bacterium is",
    "The French word for 'apple' is",
    "1 + 1 = ",
    "Once upon a time, in a land far away,",
    "The key insight of general relativity is that gravity is not a force but",
    "Water boils at 100 degrees Celsius at standard atmospheric pressure. At higher",
    "In object-oriented programming, encapsulation refers to",
    "The mitochondria is often called the powerhouse of the cell because it",
    "Shakespeare's Hamlet begins with the famous line",
]


def phase_of(L, num_layers):
    """Return the phase label ("A".."E" or "tail") for layer index ``L``.

    A 64-layer model uses fixed index boundaries (A 0-6, B 7-9, C 10-31,
    D 32-46, E 47-58, tail 59+). Any other depth uses fractional-depth
    thresholds on ``L / num_layers``.
    """
    if num_layers == 64:
        # Inclusive upper layer index of each phase.
        for upper, label in ((6, "A"), (9, "B"), (31, "C"), (46, "D"), (58, "E")):
            if L <= upper:
                return label
        return "tail"
    frac = L / num_layers
    # Exclusive upper fractional depth of each phase.
    for upper, label in ((0.11, "A"), (0.15, "B"), (0.5, "C"), (0.75, "D"), (0.92, "E")):
        if frac < upper:
            return label
    return "tail"


@torch.no_grad()
def main():
    """Run the per-layer residual-delta SVD analysis and save a JSON summary.

    Command line:
        --model   Hugging Face model id (default ``Qwen/Qwen3-32B``)
        --out     output JSON path (default ``/tmp/delta-svd.json``)
        --top-k   number of leading singular values / directions kept per
                  layer (default 8, must be >= 1)

    Loads the model on CUDA, runs one forward pass over the joined CALIB
    prompts, and for every layer takes the SVD of ``hs[L+1] - hs[L]`` with the
    first token dropped. Prints per-layer, per-phase and shared-subspace
    statistics, then writes the scalar summaries to ``--out``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="Qwen/Qwen3-32B")
    ap.add_argument("--out", default="/tmp/delta-svd.json")
    ap.add_argument("--top-k", type=int, default=8,
                    help="keep top-k singular values / directions per layer")
    args = ap.parse_args()
    if args.top_k < 1:
        # The top-1 direction of every layer is used below; fail before the
        # expensive model load instead of with an IndexError afterwards.
        ap.error("--top-k must be >= 1")

    print(f"Loading {args.model} ...", flush=True)
    tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        args.model, torch_dtype=torch.bfloat16, device_map="cuda",
        trust_remote_code=True, attn_implementation="eager",
    ).eval()
    num_layers = model.config.num_hidden_layers
    hidden = model.config.hidden_size
    print(f" L={num_layers}, hidden={hidden}", flush=True)

    # Concat calib and tokenize as one stream
    text = "\n\n".join(CALIB)
    enc = tok(text, return_tensors="pt", truncation=True, max_length=2048).to("cuda")
    n_tok = enc.input_ids.shape[1]
    print(f" calibration tokens: {n_tok}", flush=True)

    out = model(**enc, output_hidden_states=True, use_cache=False)
    # One array per hidden state, batch dimension dropped via h[0].
    # delta_L = hs[L+1] - hs[L] is treated as layer L's contribution.
    hs = [h[0].float().cpu().numpy() for h in out.hidden_states]
    print(f" hidden_states count: {len(hs)} (expect {num_layers+1})", flush=True)
    del model, out
    torch.cuda.empty_cache()

    # Per-layer SVD
    per_layer = []   # JSON-serialisable scalar summaries, one dict per layer
    layer_dirs = []  # (top_k, hidden) direction matrices, kept out of the JSON
    for L in range(num_layers):
        # Drop the first token (BOS / first-token artifacts are often outliers).
        delta = (hs[L + 1] - hs[L])[1:]
        h_in = hs[L][1:]

        token_norms = np.linalg.norm(delta, axis=1)
        h_norms = np.linalg.norm(h_in, axis=1)
        # Relative step size: ||delta|| / ||h||
        rel_step = token_norms / np.maximum(h_norms, 1e-8)
        # Per-token cosine between delta and h.
        dot = np.einsum("nd,nd->n", delta, h_in)
        cos_delta_h = dot / np.maximum(token_norms * h_norms, 1e-8)
        # "Parallel" component: how much of delta points along +/- h
        parallel_frac = np.abs(cos_delta_h).mean()

        # Economy SVD: S is decreasing, rows of Vt are right singular vectors.
        _, S, Vt = np.linalg.svd(delta, full_matrices=False)

        # Effective rank = exp(entropy of the normalized squared singular values)
        p = S**2 / (S**2).sum()
        p_nz = p[p > 1e-12]
        eff_rank = float(np.exp(-(p_nz * np.log(p_nz)).sum()))

        # Energy concentration
        top1_frac = float(p[0])
        top3_frac = float(p[:3].sum())
        top10_frac = float(p[:min(10, len(p))].sum())

        # Round through float32 then widen to float64: the same values the
        # earlier list round-trip produced, without building nested lists.
        layer_dirs.append(Vt[:args.top_k].astype(np.float32).astype(np.float64))

        phase = phase_of(L, num_layers)
        per_layer.append({
            "L": L,
            "phase": phase,
            "frob": float(np.linalg.norm(delta)),
            "token_norm_mean": float(token_norms.mean()),
            "token_norm_std": float(token_norms.std()),
            "h_norm_mean": float(h_norms.mean()),
            "rel_step_mean": float(rel_step.mean()),
            "rel_step_std": float(rel_step.std()),
            "parallel_frac": float(parallel_frac),
            "cos_delta_h_mean": float(cos_delta_h.mean()),
            "top_singvals": S[:args.top_k].tolist(),
            "eff_rank": eff_rank,
            "top1_frac": top1_frac,
            "top3_frac": top3_frac,
            "top10_frac": top10_frac,
        })
        print(f" L={L:>2} phase={phase:>4} "
              f"||h||={h_norms.mean():>7.1f} "
              f"||δ||={token_norms.mean():>7.2f} "
              f"rel={rel_step.mean():.4f} "
              f"‖parallel‖={parallel_frac:.4f} "
              f"eff_rank={eff_rank:>6.2f}",
              flush=True)

    # Pairwise cos-sim of top-1 directions across layers
    top1_dirs = np.stack([dirs[0] for dirs in layer_dirs])  # (L, d)
    top1_cos = top1_dirs @ top1_dirs.T                      # (L, L)

    print(f"\n=== Pairwise top-1 cos-sim (adjacent) ===")
    for L in range(num_layers - 1):
        print(f" L={L:>2}→{L+1:>2} phase={phase_of(L, num_layers):>4} "
              f"|cos|={abs(top1_cos[L, L+1]):>.4f}")

    # Per-phase summary: mean |cos| within phase vs cross-phase
    phase_members = {}
    for L in range(num_layers):
        phase_members.setdefault(phase_of(L, num_layers), []).append(L)

    print(f"\n=== Per-phase top-1 direction overlap ===")
    print(f" {'phase':>6} {'N':>3} {'intra_cos_mean':>14} {'cross_cos_mean':>14}")
    for ph, Ls in phase_members.items():
        if len(Ls) >= 2:
            intra = abs(top1_cos[np.ix_(Ls, Ls)])
            # Strict upper triangle: every unordered pair once, no self-pairs.
            intra_mean = float(intra[np.triu_indices(len(Ls), k=1)].mean())
        else:
            intra_mean = 1.0
        other_Ls = [L for L in range(num_layers) if L not in Ls]
        if other_Ls:
            cross_mean = float(abs(top1_cos[np.ix_(Ls, other_Ls)]).mean())
        else:
            cross_mean = 0.0
        print(f" {ph:>6} {len(Ls):>3} {intra_mean:>14.4f} {cross_mean:>14.4f}")

    # Subspace overlap: for each phase, find the block's overall principal subspace
    # and measure how much of each individual layer sits in it.
    print(f"\n=== Block-shared subspace (rank-8) capture fraction per layer ===")
    for ph, Ls in phase_members.items():
        if len(Ls) < 2:
            continue
        # Stack top-k directions from all layers in the phase
        block_dirs = np.concatenate([layer_dirs[L] for L in Ls], axis=0)
        # Right singular vectors give a shared basis of the union
        _, _, Vt_b = np.linalg.svd(block_dirs, full_matrices=False)
        shared_basis = Vt_b[:8]
        # Squared projection of each layer's top-1 direction onto that basis
        for L in Ls:
            v1 = layer_dirs[L][0]
            capture = float(((shared_basis @ v1) ** 2).sum())
            print(f" phase={ph:>4} L={L:>2} v1 captured by block top-8: {capture:.4f}")

    # Save (direction matrices are deliberately not written: too big)
    save = {
        "model": args.model,
        "num_layers": num_layers,
        "hidden": hidden,
        "n_calib_tokens": int(n_tok),
        "per_layer": per_layer,
        "top1_cos_adjacent": [float(top1_cos[L, L+1]) for L in range(num_layers-1)],
    }
    with open(args.out, "w") as f:
        json.dump(save, f, indent=2)
    print(f"\nSaved: {args.out}")
should be approximately +a temperature-rescaled version of a shared operator. The simplest test: +pick the last layer's per-head metric spectrum as anchor, and ask whether +earlier layers' spectra are scalar rescales of it. + +Three experiments on the existing per-head singular values: + + (1) Spectral shape invariance. For each head h, normalize σ_L^h by σ_max + and compare the shape vector across layers. If shapes match, scale is + the only free parameter. + + (2) Scalar rescale fit. For each (L, h), find T_L^h minimizing + ||σ_L^h - T_L^h σ_last^h||². Optimal T_L^h = <σ_L^h, σ_last^h>/||σ_last^h||². + Report residual = ||σ_L^h - T_L^h σ_last^h|| / ||σ_L^h||. + + (3) Cross-head sharing. If the *shape* is the same across heads too (not + just across layers), we could use a single anchor per *layer* (last + layer, one head) and reconstruct everything. Report mean shape + correlation across heads within a layer. + +The anchor doesn't have to be the last layer — we also try: last layer, +middle layer, per-layer-group best match. Purpose is not to pick the best +anchor but to understand which choice lets reconstruction succeed. 
def pad_to(arr, n):
    """Pad a 1D array to length n with zeros (for heads of different rank).

    Returns ``arr`` itself when it already has length ``n``; otherwise a new
    array of ``arr``'s dtype. Raises ValueError when ``arr`` is longer than
    ``n``, since padding can never shrink it.
    """
    length = arr.shape[0]
    if length == n:
        return arr
    if length > n:
        raise ValueError(f"cannot pad array of length {length} to shorter length {n}")
    out = np.zeros(n, dtype=arr.dtype)
    out[:length] = arr
    return out


def collect_spectra(data):
    """Return array sigma[L, h, k] of singular values, zero-padded.

    Reads ``data["num_layers"]``, ``data["num_heads"]`` and, for every row of
    ``data["static"]``, the per-head lists in ``metric_singvals_per_head``.
    ``k`` is the longest list found across all layers and heads.
    """
    num_layers = data["num_layers"]
    num_heads = data["num_heads"]
    # Determine max rank across all heads
    max_k = max(
        (len(s) for row in data["static"] for s in row["metric_singvals_per_head"]),
        default=0,
    )
    sigma = np.zeros((num_layers, num_heads, max_k), dtype=np.float64)
    for L, row in enumerate(data["static"]):
        for h, s in enumerate(row["metric_singvals_per_head"]):
            sigma[L, h] = pad_to(np.asarray(s, dtype=np.float64), max_k)
    return sigma


def scalar_rescale_fit(x, y):
    """Optimal scalar T s.t. ||x - T y|| is minimized.

    Returns (T, residual_frac) where residual_frac = ||x - T y|| / ||x||.
    A (near-)zero ``y`` yields ``(0.0, 1.0)``; a (near-)zero ``x`` with a
    usable ``y`` yields a residual fraction of ``0.0``.
    """
    denom = float((y * y).sum())
    if denom < 1e-20:
        return 0.0, 1.0
    T = float((x * y).sum() / denom)
    rn = float(np.linalg.norm(x - T * y))
    xn = float(np.linalg.norm(x))
    return T, (rn / xn if xn > 1e-20 else 0.0)


def cos_sim(x, y):
    """Cosine similarity of two vectors; 0.0 when either has (near-)zero norm."""
    xn = float(np.linalg.norm(x))
    yn = float(np.linalg.norm(y))
    if xn < 1e-20 or yn < 1e-20:
        return 0.0
    return float((x * y).sum() / (xn * yn))


def _mean_pairwise_cos(vectors):
    """Mean cos_sim over all unordered pairs; NaN when there are fewer than two."""
    pairs = [
        cos_sim(vectors[i], vectors[j])
        for i in range(len(vectors))
        for j in range(i + 1, len(vectors))
    ]
    # np.mean([]) would also give NaN, but with a RuntimeWarning.
    return float(np.mean(pairs)) if pairs else float("nan")


def main():
    """Test whether per-head metric spectra are scalar rescales of an anchor layer.

    Command line:
        input_json   readout JSON (``num_layers``, ``num_heads``, ``static``
                     rows with ``metric_singvals_per_head``, ``dynamic`` rows
                     with ``mean_attention_entropy_per_head``)
        --anchor     ``last`` (default), ``middle`` or ``best``

    Prints the three experiments from the module docstring and a verdict.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("input_json")
    ap.add_argument("--anchor", choices=["last", "middle", "best"], default="last")
    args = ap.parse_args()

    with open(args.input_json) as f:
        data = json.load(f)

    num_layers = data["num_layers"]
    num_heads = data["num_heads"]
    sigma = collect_spectra(data)  # (L, H, K)
    print(f"Loaded sigma: shape {sigma.shape}, max rank {sigma.shape[-1]}")

    # ------------------------------------------------------------------
    # Experiment 1: spectral shape invariance across layers (per head)
    # ------------------------------------------------------------------
    print("\n=== (1) Spectral shape invariance across layers ===")
    # Normalized shape sigma / sigma_max per (layer, head); zero spectra stay zero.
    shape = np.zeros_like(sigma)
    for L in range(num_layers):
        for h in range(num_heads):
            s = sigma[L, h]
            mx = s.max()
            shape[L, h] = s / mx if mx > 1e-20 else s

    per_head_cos = np.array([
        _mean_pairwise_cos([shape[L, h] for L in range(num_layers)])
        for h in range(num_heads)
    ])
    print(f" per-head mean pairwise cosine of shape across layers:")
    print(f" mean {per_head_cos.mean():.4f} std {per_head_cos.std():.4f} "
          f"min {per_head_cos.min():.4f} max {per_head_cos.max():.4f}")
    # If mean > ~0.99 -> shapes identical, pure scalar rescale works
    # If mean ~ 0.85-0.95 -> close but structure changes layer-to-layer
    # If mean < 0.8 -> shape varies meaningfully, scalar rescale insufficient

    # ------------------------------------------------------------------
    # Experiment 2: scalar rescale fit to an anchor layer
    # ------------------------------------------------------------------
    if args.anchor == "last":
        anchor_L = num_layers - 1
    elif args.anchor == "middle":
        anchor_L = num_layers // 2
    else:
        # best: pick layer whose shape is most typical (highest total cos
        # to all other layers, summed over heads)
        best_score = -1.0
        anchor_L = num_layers - 1
        for Lc in range(num_layers):
            score = 0.0
            for h in range(num_heads):
                for L in range(num_layers):
                    if L == Lc:
                        continue
                    score += cos_sim(shape[Lc, h], shape[L, h])
            if score > best_score:
                best_score = score
                anchor_L = Lc
        print(f" [auto-anchor] best layer by total shape-cosine: L={anchor_L}")

    print(f"\n=== (2) Scalar rescale fit to anchor L={anchor_L} ===")
    T_map = np.zeros((num_layers, num_heads))
    resid_map = np.zeros((num_layers, num_heads))
    for L in range(num_layers):
        for h in range(num_heads):
            T_map[L, h], resid_map[L, h] = scalar_rescale_fit(sigma[L, h], sigma[anchor_L, h])

    # Per-layer residual stats
    print(f" per-layer residual fraction ||σ_L^h - T σ_anchor^h|| / ||σ_L^h||:")
    print(f" {'L':>3} {'mean resid':>10} {'max resid':>10} {'mean T':>8}")
    for L in range(num_layers):
        rl = resid_map[L]
        tl = T_map[L]
        print(f" {L:>3} {rl.mean():>10.4f} {rl.max():>10.4f} {tl.mean():>8.3f}")

    print(f"\n overall mean residual: {resid_map.mean():.4f}")
    print(f" overall max residual: {resid_map.max():.4f}")
    print(f" frac of (L,h) with resid < 0.10: "
          f"{(resid_map < 0.10).mean():.3f}")
    print(f" frac of (L,h) with resid < 0.20: "
          f"{(resid_map < 0.20).mean():.3f}")

    # ------------------------------------------------------------------
    # Experiment 2b: does T match per-head dynamic entropy?
    # ------------------------------------------------------------------
    ent = np.array([row["mean_attention_entropy_per_head"]
                    for row in data["dynamic"]])  # (L, H)
    # T is a scalar temperature of the metric. Geometrically, higher T means
    # sharper attention (smaller entropy). So corr(T, entropy) should be negative
    # if the scalar rescale captures the temperature schedule.
    c = float(np.corrcoef(T_map.flatten(), ent.flatten())[0, 1])
    print(f"\n correlation corr(T_L^h, entropy_L^h) = {c:+.3f} "
          f"(negative expected: larger T → sharper → lower entropy)")

    # Also compare against the raw operator norm (largest singular value).
    op_norm = sigma.max(axis=-1)  # (L, H)
    c_op = float(np.corrcoef(op_norm.flatten(), ent.flatten())[0, 1])
    print(f" for comparison, corr(op_norm, entropy) = {c_op:+.3f}")

    # ------------------------------------------------------------------
    # Experiment 3: shape similarity across heads within a layer
    # ------------------------------------------------------------------
    print(f"\n=== (3) Cross-head shape similarity within each layer ===")
    print(f" {'L':>3} {'mean pair-cos':>14}")
    for L in range(num_layers):
        layer_mean = _mean_pairwise_cos([shape[L, h] for h in range(num_heads)])
        print(f" {L:>3} {layer_mean:>14.4f}")

    # ------------------------------------------------------------------
    # Summary
    # ------------------------------------------------------------------
    mean_cos = per_head_cos.mean()
    if mean_cos > 0.98:
        stability = "very stable"
    elif mean_cos > 0.9:
        stability = "approximately stable"
    else:
        stability = "not stable"

    print("\n=== Summary ===")
    print(f" anchor layer: {anchor_L}")
    print(f" spectral shape is {stability} "
          f"across layers (per-head mean pairwise cos = {mean_cos:.3f})")
    print(f" scalar-rescale fit residual: mean {resid_map.mean():.3f}")
    if resid_map.mean() < 0.1:
        verdict = "HYPOTHESIS SUPPORTED — scalar temperature rescale of a shared operator reconstructs earlier layers to within 10% Frobenius residual."
    elif resid_map.mean() < 0.3:
        verdict = "PARTIALLY SUPPORTED — scalar rescale captures most of the structure; a low-rank correction on top is likely enough."
    else:
        verdict = "HYPOTHESIS REJECTED for pure scalar rescale — spectra differ substantially in shape; need full layer-by-layer operators or rank-k delta."
    print(f"\n {verdict}")
def _r_squared(y, yhat):
    """Coefficient of determination of ``yhat`` against ``y`` (1 - SS_res / SS_tot)."""
    return 1 - ((y - yhat)**2).sum() / ((y - y.mean())**2).sum()


def fit_power(L, y):
    """y ≈ a · L^b → log y ≈ log a + b log L.

    The line is fitted on points with ``L > 0`` and ``y > 0``; the prediction
    ``yhat`` and R² are evaluated on all points. Returns a dict with keys
    ``form``, ``a``, ``b``, ``r2`` and ``yhat``.
    """
    mask = (L > 0) & (y > 0)
    b, loga = np.polyfit(np.log(L[mask]), np.log(y[mask]), 1)
    yhat = np.exp(loga) * (L**b)
    return {"form": "a*L^b", "a": float(np.exp(loga)), "b": float(b),
            "r2": float(_r_squared(y, yhat)), "yhat": yhat}


def fit_exponential(L, y):
    """y ≈ a · exp(b·L) → log y ≈ log a + b·L.

    Fitted on points with ``y > 0``; ``yhat`` and R² cover all points.
    Returns a dict with keys ``form``, ``a``, ``b``, ``r2`` and ``yhat``.
    """
    mask = y > 0
    b, loga = np.polyfit(L[mask], np.log(y[mask]), 1)
    yhat = np.exp(loga) * np.exp(b * L)
    return {"form": "a*exp(b*L)", "a": float(np.exp(loga)), "b": float(b),
            "r2": float(_r_squared(y, yhat)), "yhat": yhat}


def fit_linear(L, y):
    """Least-squares line y ≈ a + b·L; returns ``form``, ``a``, ``b``, ``r2``, ``yhat``."""
    b, a = np.polyfit(L, y, 1)
    yhat = a + b * L
    return {"form": "a+b*L", "a": float(a), "b": float(b),
            "r2": float(_r_squared(y, yhat)), "yhat": yhat}


def fit_piecewise_two(L, y):
    """Best split point L* and linear fits on each half (log-space).

    Tries every split value in ``range(3, len(L) - 3)``, assigning points with
    ``L < split`` to the first segment and the rest to the second, and keeps
    the split with the highest R². Splits whose segments contain a
    non-positive ``y`` are skipped. Returns None when no split is usable.
    """
    best = None
    for Ls in range(3, len(L) - 3):
        mA, mB = L < Ls, L >= Ls
        if (y[mA] <= 0).any() or (y[mB] <= 0).any():
            continue  # log-space fit needs strictly positive values
        bA, aA = np.polyfit(L[mA], np.log(y[mA]), 1)
        bB, aB = np.polyfit(L[mB], np.log(y[mB]), 1)
        yhat = np.where(mA, np.exp(aA + bA * L), np.exp(aB + bB * L))
        r2 = _r_squared(y, yhat)
        if best is None or r2 > best["r2"]:
            best = {"form": f"piecewise-exp-split@L={Ls}", "split": int(Ls),
                    "a1": float(np.exp(aA)), "b1": float(bA),
                    "a2": float(np.exp(aB)), "b2": float(bB),
                    "r2": float(r2), "yhat": yhat}
    return best


def main(path="/tmp/qwen3-4b-null.json"):
    """Fit functional forms to per-family scale trajectories and print a report.

    ``path`` names a JSON file whose ``"scales"`` object maps each family
    name to one value per layer. The default is the location the script has
    always read, so calling ``main()`` with no arguments is unchanged.
    """
    # Context manager so the file handle is closed deterministically.
    with open(path) as fh:
        d = json.load(fh)
    scales = d["scales"]
    num_layers = len(scales["input_ln"])
    L = np.arange(num_layers, dtype=float)
    last = num_layers - 1

    families_of_interest = ["input_ln", "post_attn_ln", "q_norm", "k_norm",
                            "q_proj", "k_proj", "v_proj", "o_proj",
                            "gate_proj", "up_proj", "down_proj"]

    print("=" * 72)
    # Layer count comes from the data rather than a hard-coded "36 layers".
    print(f"γ-trajectory fits per family ({num_layers} layers; source: {path})")
    print("=" * 72)

    for fam in families_of_interest:
        y = np.array(scales[fam], dtype=float)
        print(f"\n--- {fam} ---")
        print(f" L=0: {y[0]:.3f} L={last}: {y[-1]:.3f} ratio: {y[-1]/y[0]:+.2f}×")
        fits = [
            fit_linear(L, y),
            fit_power(L + 1, y),  # L+1 so L=0 doesn't explode log
            fit_exponential(L, y),
            fit_piecewise_two(L + 1, y),
        ]
        for f in fits:
            if f is None:
                continue
            extras = ""
            if "b" in f:
                extras = f" (a={f['a']:.3g}, b={f['b']:+.4f})"
            elif "split" in f:
                extras = f" (split={f['split']}, b1={f['b1']:+.4f}, b2={f['b2']:+.4f})"
            print(f" {f['form']:<32} R²={f['r2']:+.4f}{extras}")

    # For input_ln specifically: tabulate the curve and derive T(L)
    y = np.array(scales["input_ln"], dtype=float)
    print("\n" + "=" * 72)
    print("input_ln γ magnitude across layers (the schedule signal)")
    print("=" * 72)
    print(f" {'L':>3} {'γ_L':>12} {'γ_L / γ_0':>10} {'log γ_L':>10}")
    for l_idx in range(num_layers):
        # np.log yields -inf/nan for a non-positive scale instead of raising
        # like math.log, so one bad entry does not abort the whole report.
        print(f" {l_idx:>3} {y[l_idx]:>12.3f} {y[l_idx]/y[0]:>10.3f} {np.log(y[l_idx]):>+10.4f}")

    # Classical SA schedules for comparison
    # - Linear: T(k) = T0 - k * (T0 - Tf)/N
    # - Exponential / Kirkpatrick: T(k) = T0 * α^k
    # - Logarithmic / Hajek: T(k) = c / log(k+2)
    # γ growing corresponds to T cooling (larger γ -> sharper attention).
    print("\n" + "=" * 72)
    print("Derived attention-temperature T(L) interpretation")
    print("=" * 72)
    print(" Attention logit ∝ (γ * W_Q * W_K * ||residual||²) / √d_head.")
    print(" With γ_L the schedule dial and other factors ~constant across layers,")
    print(" effective attention temperature T(L) ∝ 1/γ(L).")
    print(f"\n T(L)/T(0) = γ(0)/γ(L):")
    print(f" {'L':>3} {'T(L)/T(0)':>10} (smaller = cooler = sharper attention)")
    for l_idx in range(num_layers):
        print(f" {l_idx:>3} {y[0]/y[l_idx]:>10.4f}")

    # Kirkpatrick: T(L) = T0 · α^L → log T(L) = log T0 + L log α
    logT = -np.log(y / y[0])  # because T ∝ 1/γ
    b_kirk, a_kirk = np.polyfit(L, logT, 1)
    print(f"\n Kirkpatrick-law fit (exponential cooling):")
    print(f" log T(L) = {a_kirk:+.3f} + {b_kirk:+.4f} * L → T(L) = exp({a_kirk:+.3f}) · exp({b_kirk:+.4f}·L)")
    logT_hat = a_kirk + b_kirk * L
    r2_kirk = _r_squared(logT, logT_hat)
    print(f" R² (in log space) = {r2_kirk:+.4f} — ideally ≈ 1 if cooling is pure exponential")
def main():
    """Analyze the direction structure of ``input_layernorm`` weights across layers.

    Command line:
        --model   Hugging Face model id (default ``Qwen/Qwen3-32B``)
        --out     output JSON path (default ``/tmp/gamma-dirs.json``)

    Stacks ``model.layers[L].input_layernorm.weight`` for every layer, splits
    each vector into norm and unit direction, and prints pairwise cosine,
    PCA and per-phase statistics. Scalar summaries are saved to ``--out``.
    """
    import json  # function-scope import, as in the original body

    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="Qwen/Qwen3-32B")
    ap.add_argument("--out", default="/tmp/gamma-dirs.json")
    args = ap.parse_args()

    print(f"Loading {args.model} (CPU, layernorm params only)...", flush=True)
    # NOTE(review): from_pretrained is called without any weight filter, so it
    # presumably loads the whole checkpoint even though only the layernorm
    # weights are read below — confirm memory headroom for large models.
    m = AutoModelForCausalLM.from_pretrained(
        args.model, torch_dtype=torch.float32, device_map="cpu",
        trust_remote_code=True,
    )
    num_layers = m.config.num_hidden_layers
    hidden = m.config.hidden_size
    print(f" L={num_layers}, hidden={hidden}", flush=True)

    gammas = np.stack([
        m.model.layers[L].input_layernorm.weight.detach().float().cpu().numpy()
        for L in range(num_layers)
    ])  # (L, hidden)
    del m

    norms = np.linalg.norm(gammas, axis=1)
    units = gammas / norms[:, None]

    # Pairwise cos-sim of unit γ
    cos_mat = units @ units.T  # (L, L)

    # PCA on unit vectors (mean-centered across layers)
    centered = units - units.mean(axis=0, keepdims=True)
    _, S, Vt = np.linalg.svd(centered, full_matrices=False)
    explained = S**2 / (S**2).sum()

    # How much of each unit γ_L lies along the leading principal direction?
    top1 = Vt[0]                # (hidden,)
    proj_top1 = units @ top1    # (L,)
    # Clamp at 0 so rounding cannot produce sqrt of a tiny negative number.
    residual_after_top1 = np.sqrt(np.maximum(1 - proj_top1**2, 0))

    def phase(L):
        """Phase label for layer L.

        64-layer models use the Qwen3-32B index boundaries. Other depths use
        the same fractional-depth thresholds as the delta-SVD script's
        ``phase_of``, instead of applying 64-layer indices to a model they
        do not describe.
        """
        if num_layers == 64:
            if L <= 6: return "A"
            if L <= 9: return "B"
            if L <= 31: return "C"
            if L <= 46: return "D"
            if L <= 58: return "E"
            return "tail"
        frac = L / num_layers
        if frac < 0.11: return "A"
        if frac < 0.15: return "B"
        if frac < 0.5: return "C"
        if frac < 0.75: return "D"
        if frac < 0.92: return "E"
        return "tail"

    phase_ls = {}
    for L in range(num_layers):
        phase_ls.setdefault(phase(L), []).append(L)

    print(f"\n=== ||γ_L|| per layer (scalar magnitude) ===")
    for L in range(num_layers):
        print(f" L={L:>2} phase={phase(L):>5} ||γ||={norms[L]:>8.3f} "
              f"proj_top1={proj_top1[L]:>+.4f} resid={residual_after_top1[L]:>.4f}")

    print(f"\n=== PCA of unit γ vectors (direction structure) ===")
    print(f" Explained variance, top 10 components:")
    for i in range(min(10, len(S))):
        print(f" PC{i}: {explained[i]:.4f} (singular_val={S[i]:.4f})")
    print(f" Top-3 explain: {explained[:3].sum():.4f}")
    print(f" Top-10 explain: {explained[:10].sum():.4f}")

    print(f"\n=== Per-phase direction statistics ===")
    print(f" {'phase':>6} {'N':>3} {'||γ||_mean':>10} {'||γ||_std':>9} "
          f"{'intra_cos':>9} {'vs_other_cos':>12}")
    for ph, Ls in phase_ls.items():
        u = units[Ls]
        # Strict upper triangle: each unordered pair within the phase once.
        intra = (u @ u.T)[np.triu_indices(len(Ls), k=1)]
        intra_mean = intra.mean() if len(intra) > 0 else 1.0
        # Vs other phases
        other_Ls = [L for L in range(num_layers) if L not in Ls]
        if other_Ls:
            vs_mean = (u @ units[other_Ls].T).mean()
        else:
            vs_mean = 0.0
        print(f" {ph:>6} {len(Ls):>3} {norms[Ls].mean():>10.3f} "
              f"{norms[Ls].std():>9.3f} {intra_mean:>+9.4f} {vs_mean:>+12.4f}")

    print(f"\n=== Adjacent-pair unit-γ cos-sim ===")
    for L in range(num_layers - 1):
        print(f" L={L:>2}→{L+1:>2} phase={phase(L):>5} cos={cos_mat[L, L+1]:>+.4f}")

    with open(args.out, "w") as f:
        json.dump({
            "model": args.model,
            "num_layers": num_layers,
            "norms": norms.tolist(),
            "proj_top1": proj_top1.tolist(),
            "explained_var": explained.tolist(),
            "cos_adjacent": [float(cos_mat[L, L+1]) for L in range(num_layers - 1)],
        }, f, indent=2)
    print(f"\nSaved: {args.out}")
def compute_per_head_geometry(singvals_list):
    """Compute geometric descriptors of each head's singular-value spectrum.

    singvals_list: list per head of list of singular values.

    Returns a dict of float arrays, one entry per head, with keys:
        op          largest singular value
        fro         sqrt of the sum of squared singular values
        trace       sum of singular values
        rank_eff    sum / max
        spec_ent    entropy (nats) of the normalized squared singular values
        anisotropy  max / mean
        condition   max / min
    Denominators are floored at 1e-12. A head with an empty spectrum gets
    0.0 for every descriptor instead of raising on ``max()`` / ``min()``.
    """
    keys = ("op", "fro", "trace", "rank_eff", "spec_ent", "anisotropy", "condition")
    out = {k: np.zeros(len(singvals_list)) for k in keys}
    for i, raw in enumerate(singvals_list):
        s = np.asarray(raw, dtype=np.float64)
        if s.size == 0:
            continue  # nothing to measure; leave the zeros in place
        top = s.max()
        total = s.sum()
        energy = (s ** 2).sum()
        out["op"][i] = top
        out["fro"][i] = np.sqrt(energy)
        out["trace"][i] = total
        out["rank_eff"][i] = total / max(top, 1e-12)
        # Spectral entropy: use normalized σ² as probabilities; clip so that
        # log() is never evaluated at zero.
        p = np.clip((s ** 2) / max(energy, 1e-12), 1e-12, 1.0)
        out["spec_ent"][i] = float(-(p * np.log(p)).sum())
        out["anisotropy"][i] = top / max(s.mean(), 1e-12)
        out["condition"][i] = top / max(s.min(), 1e-12)
    return out


def main():
    """Correlate per-head geometric descriptors with dynamic attention entropy.

    Command line: ``input_json`` — readout JSON with ``num_layers``,
    ``num_heads``, ``dynamic`` rows (``mean_attention_entropy_per_head``,
    ``mean_logit_std_per_head``) and ``static`` rows
    (``metric_singvals_per_head``).

    Prints Pearson correlations over all (layer, head) pairs, the same
    stratified by layer thirds, the best single predictor, and a standardized
    multi-descriptor linear regression.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("input_json")
    args = ap.parse_args()

    with open(args.input_json) as f:
        data = json.load(f)

    num_layers = data["num_layers"]
    num_heads = data["num_heads"]

    # Entropy per (layer, head)
    ent = np.array([row["mean_attention_entropy_per_head"] for row in data["dynamic"]])  # (L, H)
    logit_std = np.array([row["mean_logit_std_per_head"] for row in data["dynamic"]])  # (L, H)

    # Geometric descriptors per (layer, head)
    geom = {k: np.zeros((num_layers, num_heads)) for k in
            ["op", "fro", "trace", "rank_eff", "spec_ent", "anisotropy", "condition"]}
    for L, row in enumerate(data["static"]):
        per_head = compute_per_head_geometry(row["metric_singvals_per_head"])
        for k, v in per_head.items():
            geom[k][L] = v

    # Flatten across (layer, head) and correlate
    print("All (layer, head) pairs — Pearson correlation with dynamic entropy:")
    ent_flat = ent.flatten()
    logit_flat = logit_std.flatten()
    results = {}
    for k, v in geom.items():
        v_flat = v.flatten()
        c_ent = float(np.corrcoef(v_flat, ent_flat)[0, 1])
        c_logit = float(np.corrcoef(v_flat, logit_flat)[0, 1])
        results[k] = (c_ent, c_logit)
        print(f" {k:12} vs entropy: {c_ent:+.3f} vs logit_std: {c_logit:+.3f}")

    # Stratify by layer position: first / middle / last third of the stack
    thirds = [(0, num_layers // 3, "early"),
              (num_layers // 3, 2 * num_layers // 3, "mid"),
              (2 * num_layers // 3, num_layers, "late")]
    print("\nStratified by layer position (entropy correlation):")
    for lo, hi, name in thirds:
        print(f" [{name} L{lo}-{hi-1}]", end="")
        if hi <= lo:
            # Fewer than three layers leaves some strata empty; corrcoef on
            # empty input would only produce warnings and NaN.
            print(" n/a")
            continue
        for k in ["op", "fro", "rank_eff", "spec_ent", "anisotropy", "condition"]:
            c = float(np.corrcoef(geom[k][lo:hi].flatten(), ent[lo:hi].flatten())[0, 1])
            print(f" {k}:{c:+.2f}", end="")
        print()

    # Best single predictor across all
    print("\nBest single geometric predictor of entropy (abs):")
    best = max(results.items(), key=lambda kv: abs(kv[1][0]))
    print(f" {best[0]} r = {best[1][0]:+.3f}")

    # Multi-regression: try op, spec_ent, rank_eff, anisotropy jointly
    print("\nLinear regression of entropy on multiple descriptors (standardized):")
    X_cols = ["op", "spec_ent", "rank_eff", "anisotropy"]
    X = np.stack([geom[k].flatten() for k in X_cols], axis=1)
    # standardize (epsilon keeps constant columns from dividing by zero)
    X = (X - X.mean(axis=0)) / (X.std(axis=0) + 1e-12)
    y = (ent_flat - ent_flat.mean()) / (ent_flat.std() + 1e-12)
    # Trailing column of ones is the intercept term.
    X1 = np.concatenate([X, np.ones((X.shape[0], 1))], axis=1)
    coef, *_ = np.linalg.lstsq(X1, y, rcond=None)
    y_pred = X1 @ coef
    r2 = 1 - float(((y - y_pred) ** 2).sum() / ((y - y.mean()) ** 2).sum())
    print(f" R² = {r2:.3f}")
    print(f" standardized coefficients:")
    for name, c in zip(X_cols, coef[:-1]):
        print(f" {name:12} {c:+.3f}")
def procrustes(M):
    """Orthogonal matrix R maximizing tr(R M).

    With the SVD M = U Σ V^T, tr(R M) = tr((V^T R U) Σ) is largest when
    V^T R U = I, i.e. R = V U^T. (U V^T is the maximizer of tr(R^T M), the
    transpose of what is needed here.)

    Callers pass M = Σ_i A_i B_i^T to obtain the R minimizing Σ_i ||R A_i - B_i||_F².
    """
    U, _, Vh = np.linalg.svd(M, full_matrices=False)
    return Vh.T @ U.T


def fro(x):
    """Frobenius (for vectors: Euclidean) norm of ``x`` as a Python float."""
    return float(np.linalg.norm(x))


def normalize_fro(x, eps=1e-12):
    """Scale ``x`` to unit Frobenius norm; ``eps`` floors the divisor so an
    all-zero input is returned as zeros rather than NaN."""
    n = fro(x)
    return x / max(n, eps)


@torch.no_grad()
def main():
    """Measure per-family weight drift between consecutive layers of a block.

    Loads the model on CPU in float32. For every consecutive layer pair in
    ``--block-start..--block-end`` (inclusive) it Frobenius-normalizes each
    weight matrix, aligns the pair with an orthogonal Procrustes rotation
    (one d_h x d_h rotation per query head for q/k/v/o, one d_ff x d_ff
    rotation per layer for gate/up/down), and records the post-alignment
    Frobenius residual per family. Norm weight vectors are compared directly.
    Prints a table sorted by mean residual and writes the raw residual lists
    to ``--out`` as JSON.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="Qwen/Qwen3-4B")
    ap.add_argument("--block-start", type=int, default=10)
    ap.add_argument("--block-end", type=int, default=25,
                    help="inclusive; this is mid-block of 36-layer model")
    ap.add_argument("--out", default="/tmp/sa-layer-variation.json")
    args = ap.parse_args()

    print(f"Loading {args.model} ...", flush=True)
    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        torch_dtype=torch.float32,
        device_map="cpu",
        trust_remote_code=True,
        attn_implementation="eager",
    )
    cfg = model.config
    num_layers = cfg.num_hidden_layers
    num_heads = cfg.num_attention_heads
    num_kv_heads = getattr(cfg, "num_key_value_heads", num_heads)
    hidden = cfg.hidden_size
    head_dim = getattr(cfg, "head_dim", hidden // num_heads)
    intermediate = cfg.intermediate_size
    print(f" L={num_layers} H={num_heads} kv={num_kv_heads} hd={head_dim} "
          f"hidden={hidden} ff={intermediate}", flush=True)

    # Fail with a usage error instead of a KeyError deep in the pair loop.
    if not 0 <= args.block_start <= args.block_end < num_layers:
        ap.error(f"block {args.block_start}..{args.block_end} is outside "
                 f"the model's layers 0..{num_layers - 1}")
    block = list(range(args.block_start, args.block_end + 1))

    def as_np(param):
        return param.detach().numpy().astype(np.float32)

    # Copy weights to numpy float32 only for layers in the block; the other
    # layers are never read below.
    layers = {}
    for L in block:
        layer = model.model.layers[L]
        attn = layer.self_attn
        mlp = layer.mlp
        layers[L] = {
            "q_proj": as_np(attn.q_proj.weight),  # (nh*hd, hidden)
            "k_proj": as_np(attn.k_proj.weight),  # (nkv*hd, hidden)
            "v_proj": as_np(attn.v_proj.weight),
            "o_proj": as_np(attn.o_proj.weight),  # (hidden, nh*hd)
            "gate_proj": as_np(mlp.gate_proj.weight),
            "up_proj": as_np(mlp.up_proj.weight),
            "down_proj": as_np(mlp.down_proj.weight),
            "input_ln": as_np(layer.input_layernorm.weight),
            "post_attn_ln": as_np(layer.post_attention_layernorm.weight),
        }
        # Qwen3 has q_norm / k_norm inside self_attn; other models may not.
        q_norm = getattr(attn, "q_norm", None)
        k_norm = getattr(attn, "k_norm", None)
        if q_norm is not None:
            layers[L]["q_norm"] = as_np(q_norm.weight)
        if k_norm is not None:
            layers[L]["k_norm"] = as_np(k_norm.weight)

    del model  # free memory

    pairs = [(block[i], block[i + 1]) for i in range(len(block) - 1)]
    print(f"\nAnalyzing block layers {args.block_start}..{args.block_end} "
          f"({len(pairs)} consecutive pairs)\n")

    # ------------------------------------------------------------------
    # Reshape attention weights per-head for rotation alignment
    # ------------------------------------------------------------------
    def per_head_split(W_qkv, n_heads_for_this):
        # W is (n*hd, hidden). Reshape to (n, hd, hidden).
        return W_qkv.reshape(n_heads_for_this, head_dim, hidden)

    def per_head_split_o(W_o):
        # W is (hidden, n*hd). Reshape to (n, hidden, hd).
        return W_o.reshape(hidden, num_heads, head_dim).transpose(1, 0, 2)

    # GQA: map a query-head index to the K/V head it shares.
    def q_to_kv_index(h):
        return (h * num_kv_heads) // num_heads

    family_residuals = {fam: [] for fam in
                        ["q_proj", "k_proj", "v_proj", "o_proj",
                         "gate_proj", "up_proj", "down_proj",
                         "input_ln", "post_attn_ln", "q_norm", "k_norm"]}

    for (L1, L2) in pairs:
        A = layers[L1]
        B = layers[L2]

        # Per-head attention alignment:
        Q1 = per_head_split(A["q_proj"], num_heads)
        Q2 = per_head_split(B["q_proj"], num_heads)
        K1 = per_head_split(A["k_proj"], num_kv_heads)
        K2 = per_head_split(B["k_proj"], num_kv_heads)
        V1 = per_head_split(A["v_proj"], num_kv_heads)
        V2 = per_head_split(B["v_proj"], num_kv_heads)
        O1 = per_head_split_o(A["o_proj"])  # (num_heads, hidden, hd)
        O2 = per_head_split_o(B["o_proj"])

        q_res = []
        k_res = []
        v_res = []
        o_res = []
        for h in range(num_heads):
            kv_h = q_to_kv_index(h)
            # Normalize each matrix by its Frobenius norm
            qa = normalize_fro(Q1[h])
            qb = normalize_fro(Q2[h])
            ka = normalize_fro(K1[kv_h])
            kb = normalize_fro(K2[kv_h])
            va = normalize_fro(V1[kv_h])
            vb = normalize_fro(V2[kv_h])
            oa = normalize_fro(O1[h])
            ob = normalize_fro(O2[h])

            # One rotation R (hd x hd) for the whole head:
            #   ||R qa - qb||^2    = const - 2 tr(R qa qb^T)   (same for K, V)
            #   ||oa R^T - ob||^2  = const - 2 tr(R oa^T ob)
            # so the joint optimum maximizes tr(R M) with M the sum below.
            # NOTE(review): the Q/K invariance and the V/O invariance are
            # independent symmetries, and with GQA one K/V head is shared by
            # several query heads; a single R per query head is a stricter
            # alignment than the symmetry requires — confirm this is intended.
            M = qa @ qb.T + ka @ kb.T + va @ vb.T + oa.T @ ob
            R = procrustes(M)

            # Apply R and measure residual (Frobenius distance) per-matrix
            q_res.append(fro(R @ qa - qb))
            k_res.append(fro(R @ ka - kb))
            v_res.append(fro(R @ va - vb))
            o_res.append(fro(oa @ R.T - ob))

        family_residuals["q_proj"].append(float(np.mean(q_res)))
        family_residuals["k_proj"].append(float(np.mean(k_res)))
        family_residuals["v_proj"].append(float(np.mean(v_res)))
        family_residuals["o_proj"].append(float(np.mean(o_res)))

        # MLP d_ff rotation alignment: find S (d_ff x d_ff) orthogonal with
        # S gate_a ≈ gate_b and S up_a ≈ up_b simultaneously; down_proj is
        # compared as down_a S^T ≈ down_b. gate/up are (d_ff, hidden).
        ga = normalize_fro(A["gate_proj"])
        gb = normalize_fro(B["gate_proj"])
        ua = normalize_fro(A["up_proj"])
        ub = normalize_fro(B["up_proj"])
        da = normalize_fro(A["down_proj"])  # (hidden, d_ff)
        db = normalize_fro(B["down_proj"])
        # M_ff = ga @ gb^T + ua @ ub^T + da^T @ db (all d_ff x d_ff)
        M_ff = ga @ gb.T + ua @ ub.T + da.T @ db
        S = procrustes(M_ff)
        family_residuals["gate_proj"].append(fro(S @ ga - gb))
        family_residuals["up_proj"].append(fro(S @ ua - ub))
        family_residuals["down_proj"].append(fro(da @ S.T - db))

        # Norm weight vectors — no rotation gauge; just scale-normalize and diff
        for ln_name in ["input_ln", "post_attn_ln", "q_norm", "k_norm"]:
            if ln_name in A and ln_name in B:
                va_ = normalize_fro(A[ln_name])
                vb_ = normalize_fro(B[ln_name])
                family_residuals[ln_name].append(fro(va_ - vb_))

    # Report
    print("=== Per-family Frobenius residual between consecutive layers, "
          f"block L={args.block_start}..{args.block_end}, after alignment + scale-norm ===\n")
    print(" (Residual = Frobenius distance between L and L+1 after rotation alignment;")
    print(" lower = more shared across block; higher = carries layer-to-layer drift)\n")
    print(f" {'family':>14} {'mean':>8} {'min':>8} {'max':>8} {'std':>8} n")
    # Report families sorted by mean variation
    items = [(fam, np.array(v)) for fam, v in family_residuals.items() if len(v) > 0]
    items.sort(key=lambda kv: float(kv[1].mean()))
    for fam, v in items:
        print(f" {fam:>14} {v.mean():>8.4f} {v.min():>8.4f} {v.max():>8.4f} {v.std():>8.4f} {len(v)}")

    print("\n Families ranked least-to-most variation:")
    for i, (fam, v) in enumerate(items):
        print(f" {i+1}. {fam} (mean residual {v.mean():.4f})")

    # Save
    with open(args.out, "w") as f:
        json.dump({
            "model": args.model,
            "block_start": args.block_start,
            "block_end": args.block_end,
            "family_residuals": {k: list(v) for k, v in family_residuals.items()},
        }, f, indent=2)
    print(f"\nSaved: {args.out}")
def metric_right_svd(Wq, Wk):
    """Singular values and right singular vectors of g = Wk^T Wq.

    ``Wq`` and ``Wk`` are (head_dim, hidden) with head_dim <= hidden. g is
    (hidden, hidden) with rank <= head_dim and is never formed: with the thin
    QR factorization Wk^T = Q R (Q has orthonormal columns),
    g^T g = (R Wq)^T (R Wq), so g and the small (head_dim, hidden) matrix
    R @ Wq share singular values and right singular vectors.

    Returns:
        (S, Vh): S of shape (head_dim,), descending; Vh of shape
        (head_dim, hidden) with orthonormal rows, row i belonging to S[i].
    """
    _, R = torch.linalg.qr(Wk.T)
    _, S, Vh = torch.linalg.svd(R @ Wq, full_matrices=False)
    return S, Vh


@torch.no_grad()
def measure(model_name: str, out_path: str, topk: int = 8,
            dtype=torch.bfloat16):
    """Measure inter-layer relationships between per-head metrics g = W_K^T W_Q.

    Loads ``model_name`` on CUDA and writes two files:

      * ``out_path`` (JSON): ``gram[L][L'][h]`` = Frobenius inner product
        <g_L^h, g_L'^h>, ``fro_sq[L][h]`` = ||g_L^h||_F^2, plus model shape
        metadata.
      * ``<out_path without extension>-eigdirs.pt``: ``eig_dirs`` (top-k right
        singular vectors of g per head, in hidden space) and ``sym_eigs``
        (signed non-zero eigenvalues of g + g^T per head, sorted by magnitude).

    Args:
        model_name: identifier passed to ``AutoModelForCausalLM.from_pretrained``.
        out_path: path of the JSON output.
        topk: number of singular directions kept per head (capped at head_dim).
        dtype: dtype the model is loaded in; weights are cast to float32 before use.
    """
    print(f"Loading {model_name} ...", flush=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,
        device_map="cuda",
        trust_remote_code=True,
        attn_implementation="eager",
    )
    model.eval()
    cfg = model.config
    num_layers = cfg.num_hidden_layers
    num_heads = cfg.num_attention_heads
    hidden = cfg.hidden_size
    head_dim = getattr(cfg, "head_dim", hidden // num_heads)
    num_kv_heads = getattr(cfg, "num_key_value_heads", num_heads)
    print(f" L={num_layers} H={num_heads} kv={num_kv_heads} hd={head_dim}", flush=True)

    # Collect W_Q, W_K per layer as (num_heads, head_dim, hidden) float32.
    # With grouped-query attention several query heads share one K head, so K
    # is expanded to one slice per query head.
    kv_of_head = [(h * num_kv_heads) // num_heads for h in range(num_heads)]
    Wq_list = []
    Wk_list = []
    for layer in model.model.layers:
        attn = layer.self_attn
        Wq = attn.q_proj.weight.detach().to(torch.float32)  # (nh*hd, hidden)
        Wk = attn.k_proj.weight.detach().to(torch.float32)  # (nkv*hd, hidden)
        Wq_list.append(Wq.view(num_heads, head_dim, hidden))
        Wk_list.append(Wk.view(num_kv_heads, head_dim, hidden)[kv_of_head])
    print(f" loaded weights: {num_layers} layers", flush=True)

    # Per head:
    #  * top-k right singular vectors of g (they lie in row-space(W_Q)) and
    #    ||g||_F^2 = sum of squared singular values of g. These must come from
    #    g itself: W_K W_Q^T is a different matrix with a different spectrum.
    #  * SIGNED non-zero eigenvalues of g + g^T. With X = [W_Q; W_K] and
    #    J = [[0, I], [I, 0]], g + g^T = X^T J X, whose non-zero eigenvalues
    #    equal those of the 2d_h x 2d_h matrix J (X X^T).
    topk_eff = min(topk, head_dim)
    eig_dirs = torch.zeros(num_layers, num_heads, topk_eff, hidden,
                           dtype=torch.float32)
    fro_sq = torch.zeros(num_layers, num_heads, dtype=torch.float64)
    sym_eigs = torch.zeros(num_layers, num_heads, 2 * head_dim,
                           dtype=torch.float64)  # signed
    for L in range(num_layers):
        for h in range(num_heads):
            Wq = Wq_list[L][h]  # (hd, hidden)
            Wk = Wk_list[L][h]  # (hd, hidden)
            S, Vh = metric_right_svd(Wq, Wk)
            eig_dirs[L, h] = Vh[:topk_eff].cpu()
            fro_sq[L, h] = float((S * S).sum())

            X = torch.cat([Wq, Wk], dim=0)  # (2hd, hidden)
            XXT = X @ X.T
            # Left-multiplying by J swaps the two block rows.
            M = torch.cat([XXT[head_dim:], XXT[:head_dim]], dim=0)
            # M is not symmetric, but its eigenvalues are those of the
            # symmetric g + g^T (plus zeros), hence real up to numerical
            # noise; keep the real parts.
            ev_real = torch.linalg.eigvals(M).real.cpu().double()
            # sort by magnitude descending so top eigenvalues come first
            order = torch.argsort(ev_real.abs(), descending=True)
            sym_eigs[L, h] = ev_real[order]
        if L % 8 == 0:
            print(f" eigdecomp L={L}", flush=True)

    # Gram matrix: gram[L, L', h] = <g_L^h, g_L'^h>.
    # With A = W_K_L W_K_L'^T and B = W_Q_L W_Q_L'^T (both hd x hd),
    # <g_L, g_L'> = tr(A B^T) = sum(A * B).
    gram = torch.zeros(num_layers, num_layers, num_heads, dtype=torch.float64)
    for L in range(num_layers):
        for Lp in range(L, num_layers):
            for h in range(num_heads):
                A = Wk_list[L][h] @ Wk_list[Lp][h].T  # (hd, hd)
                B = Wq_list[L][h] @ Wq_list[Lp][h].T  # (hd, hd)
                v = float((A * B).sum())
                gram[L, Lp, h] = v
                gram[Lp, L, h] = v
        if L % 4 == 0:
            print(f" gram row L={L}", flush=True)

    # Save
    out = {
        "model": model_name,
        "num_layers": num_layers,
        "num_heads": num_heads,
        "head_dim": head_dim,
        "hidden_size": hidden,
        "topk": topk_eff,
        "gram": gram.tolist(),
        "fro_sq": fro_sq.tolist(),
    }
    with open(out_path, "w") as f:
        json.dump(out, f)
    # Derive the tensor path from the stem: str.replace(".json", ...) returns
    # out_path unchanged when it has no ".json", and torch.save would then
    # overwrite the JSON written just above.
    stem, _ext = os.path.splitext(out_path)
    eig_path = f"{stem}-eigdirs.pt"
    torch.save({"eig_dirs": eig_dirs, "sym_eigs": sym_eigs}, eig_path)
    print(f"Wrote {out_path} and {eig_path}", flush=True)


def main():
    """Command-line entry point: parse --model / --out / --topk and run measure()."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="Qwen/Qwen3-4B")
    ap.add_argument("--out", default="/tmp/sa-grams.json")
    ap.add_argument("--topk", type=int, default=8)
    args = ap.parse_args()
    measure(args.model, args.out, topk=args.topk)
def spec_entropy(singvals, eps=1e-12):
    """Entropy (nats) of the normalized squared-singular-value distribution.

    ``eps`` floors both the normalizer and each probability so log() stays
    finite for zero singular values.
    """
    p = (singvals ** 2)
    p = p / max(p.sum(), eps)
    p = np.clip(p, eps, 1.0)
    return float(-(p * np.log(p)).sum())


def frob(x):
    """Frobenius (for vectors: Euclidean) norm of ``x`` as a Python float."""
    return float(np.linalg.norm(x))


def norm_mat(x, eps=1e-12):
    """Scale ``x`` to unit Frobenius norm, flooring the divisor at ``eps``."""
    return x / max(frob(x), eps)


def null_test_pair(A_dict, B_dict, family_names, num_heads, num_kv_heads, head_dim):
    """Raw cosine similarity and unit-norm residual for two layers' weights.

    For each name in ``family_names`` present in both dicts with equal shapes
    and non-negligible norms, computes

        cos        = <Wa, Wb> / (||Wa||_F ||Wb||_F)
        resid_norm = || Wa/||Wa|| - Wb/||Wb|| ||_F = sqrt(2 - 2 cos)

    Families that are missing, shape-mismatched or (near-)zero are omitted
    from the result. ``resid_eff_rank`` and ``resid_spec_entropy`` are always
    ``None``: the residual SVD that would fill them is skipped.
    ``num_heads``, ``num_kv_heads`` and ``head_dim`` are accepted but unused.

    Returns:
        dict: family name -> {"cos", "resid_norm", "resid_eff_rank",
        "resid_spec_entropy"}.
    """
    out = {}
    for fam in family_names:
        if fam not in A_dict or fam not in B_dict:
            continue
        Wa = A_dict[fam]
        Wb = B_dict[fam]
        if Wa.shape != Wb.shape:
            continue
        fa = frob(Wa)
        fb = frob(Wb)
        if fa < 1e-12 or fb < 1e-12:
            continue
        cos = float((Wa * Wb).sum() / (fa * fb))
        # Clamp at 0: rounding can push cos marginally above 1.
        resid_norm = float(np.sqrt(max(2.0 - 2.0 * cos, 0.0)))
        out[fam] = {
            "cos": cos,
            "resid_norm": resid_norm,
            "resid_eff_rank": None,
            "resid_spec_entropy": None,
        }
    return out


@torch.no_grad()
def main():
    """Null test over the whole network: how similar are adjacent layers' raw weights?

    Loads the model on CPU in bfloat16, converts each parameter family of
    every layer to float32 numpy, and for every adjacent layer pair computes
    the per-family cosine similarity and unit-norm residual. Prints the
    per-pair cos table, a per-family summary (including how much a single
    scalar rescaling would reduce the residual), the Frobenius-norm track of
    each family across depth, and a PCA of the log-norm track. Writes all of
    it to ``--out`` as JSON.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="Qwen/Qwen3-4B")
    ap.add_argument("--out", default="/tmp/sa-null-residual.json")
    args = ap.parse_args()

    print(f"Loading {args.model} ...", flush=True)
    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        torch_dtype=torch.bfloat16,  # halve memory vs fp32
        device_map="cpu",
        trust_remote_code=True,
        attn_implementation="eager",
    )
    cfg = model.config
    num_layers = cfg.num_hidden_layers
    num_heads = cfg.num_attention_heads
    num_kv_heads = getattr(cfg, "num_key_value_heads", num_heads)
    hidden = cfg.hidden_size
    head_dim = getattr(cfg, "head_dim", hidden // num_heads)
    intermediate = cfg.intermediate_size
    print(f" L={num_layers} H={num_heads} kv={num_kv_heads} hd={head_dim} "
          f"hidden={hidden} ff={intermediate}", flush=True)

    families = ["q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
                "input_ln", "post_attn_ln", "q_norm", "k_norm"]

    layers = {}
    for L in range(num_layers):
        layer = model.model.layers[L]
        attn = layer.self_attn
        mlp = layer.mlp
        entry = {
            "q_proj": attn.q_proj.weight.detach().float().numpy(),
            "k_proj": attn.k_proj.weight.detach().float().numpy(),
            "v_proj": attn.v_proj.weight.detach().float().numpy(),
            "o_proj": attn.o_proj.weight.detach().float().numpy(),
            "gate_proj": mlp.gate_proj.weight.detach().float().numpy(),
            "up_proj": mlp.up_proj.weight.detach().float().numpy(),
            "down_proj": mlp.down_proj.weight.detach().float().numpy(),
            "input_ln": layer.input_layernorm.weight.detach().float().numpy(),
            "post_attn_ln": layer.post_attention_layernorm.weight.detach().float().numpy(),
        }
        # q_norm / k_norm exist only on some architectures.
        qn = getattr(attn, "q_norm", None)
        kn = getattr(attn, "k_norm", None)
        if qn is not None:
            entry["q_norm"] = qn.weight.detach().float().numpy()
        if kn is not None:
            entry["k_norm"] = kn.weight.detach().float().numpy()
        layers[L] = entry

    del model

    # Also record per-layer scale (Frobenius norm) for the scale-track PCA
    scales = {fam: [] for fam in families}
    for L in range(num_layers):
        for fam in families:
            if fam in layers[L]:
                scales[fam].append(frob(layers[L][fam]))
            else:
                scales[fam].append(None)

    # Pairwise null test
    pair_results = []
    for L in range(num_layers - 1):
        r = null_test_pair(layers[L], layers[L + 1], families,
                           num_heads, num_kv_heads, head_dim)
        pair_results.append({"L": L, "L_next": L + 1, "families": r})

    # Report
    print("\n=== Adjacent-layer raw cos-sim per family ===")
    print(" null interpretation: 1.0 = identical matrices up to scale, 0 = orthogonal")
    print(f"\n {'L':>3}", end="")
    for fam in families:
        if any(fam in pr["families"] for pr in pair_results):
            print(f" {fam:>12}", end="")
    print()
    for pr in pair_results:
        print(f" {pr['L']:>3}", end="")
        for fam in families:
            if fam in pr["families"]:
                print(f" {pr['families'][fam]['cos']:>+12.4f}", end="")
            else:
                print(f" {'':>12}", end="")
        print()

    # Summary per family + scalar-T fit comparison
    # raw_resid = sqrt(2 - 2*cos); scalar_fit = sqrt(1 - cos²) = sin(angle).
    # random_baseline = sqrt(2) ≈ 1.414.
    print("\n=== Per-family summary (across all adjacent pairs) ===")
    print(" random baseline = sqrt(2) ≈ 1.414 (what we'd see with no relationship)")
    print(f"\n {'family':>14} {'mean_cos':>10} {'median_cos':>11} "
          f"{'raw_resid':>10} {'scalar_fit':>11} {'improve_frac':>13} {'mean_SE':>8}")
    for fam in families:
        cs = [pr["families"].get(fam, {}).get("cos") for pr in pair_results]
        cs = [x for x in cs if x is not None]
        ses = [pr["families"].get(fam, {}).get("resid_spec_entropy") for pr in pair_results]
        ses = [x for x in ses if x is not None]
        if not cs:
            continue
        raw = np.sqrt(np.maximum(2.0 - 2.0 * np.array(cs), 0.0)).mean()
        scalar_fit = np.sqrt(np.maximum(1.0 - np.array(cs) ** 2, 0.0)).mean()
        # Improvement fraction: (raw - scalar_fit) / raw, in [0, 1] where
        # 0 = scalar does nothing, 1 = scalar reconstructs.
        improve_frac = (raw - scalar_fit) / max(raw, 1e-12)
        # Residual spectra are not computed at present, so show "n/a" instead
        # of a 0.0000 that would read as a measured entropy of zero.
        se_cell = f"{np.mean(ses):>8.4f}" if ses else f"{'n/a':>8}"
        print(f" {fam:>14} {np.mean(cs):>+10.4f} {np.median(cs):>+11.4f} "
              f"{raw:>10.4f} {scalar_fit:>11.4f} {improve_frac:>13.4f} "
              f"{se_cell}")

    # Scale-track: Frobenius norm of each family across layers
    print("\n=== Scale track: ||W_family||_F across layers ===")
    print(f" {'L':>3}", end="")
    for fam in families:
        if any(s is not None for s in scales[fam]):
            print(f" {fam:>12}", end="")
    print()
    for L in range(num_layers):
        print(f" {L:>3}", end="")
        for fam in families:
            if scales[fam][L] is not None:
                print(f" {scales[fam][L]:>12.4f}", end="")
            else:
                print(f" {'':>12}", end="")
        print()

    # PCA of log-scale-track to see dimensionality of schedule
    print("\n=== PCA of log-scale-track (dimensionality of schedule) ===")
    scale_matrix = []
    fam_used = []
    for fam in families:
        vals = scales[fam]
        # Only families present in every layer can form a full row.
        if all(v is not None for v in vals):
            scale_matrix.append(np.log(np.array(vals)))
            fam_used.append(fam)
    scale_matrix = np.array(scale_matrix)  # (num_families, L)
    # Center per-family
    sm_c = scale_matrix - scale_matrix.mean(axis=1, keepdims=True)
    # SVD: columns are layers, rows are families
    U, S, Vh = np.linalg.svd(sm_c, full_matrices=False)
    total = (S ** 2).sum()
    print(" explained variance by mode:")
    for i, s in enumerate(S):
        pct = float(s ** 2 / max(total, 1e-20)) * 100
        print(f" mode {i+1:>2}: {pct:>6.2f}% "
              f"(loadings per family: "
              f"{', '.join(f'{fam_used[j]}={U[j, i]:+.2f}' for j in range(len(fam_used)))})")

    # Save
    with open(args.out, "w") as f:
        json.dump({
            "model": args.model,
            "pair_results": pair_results,
            "scales": scales,
            "scale_pca_singvals": S.tolist(),
            "scale_pca_loadings": U.tolist(),
            "scale_pca_scores": (np.diag(S) @ Vh).tolist(),
            "fam_used": fam_used,
        }, f, indent=2)
    print(f"\nSaved: {args.out}")
b/sa-schedule-readout-measure.py new file mode 100644 index 0000000..5306fb6 --- /dev/null +++ b/sa-schedule-readout-measure.py @@ -0,0 +1,246 @@ +""" +SA schedule readout for a dense softmax-attention LLM (Qwen3-8B by default). + +Measures per-layer "temperature" signals: + - entropy of softmax attention (per head, aggregated) + - magnitude of pre-softmax logits (implicit sharpness) + - spectrum of the parameter metric g_L^h = W_K^h^T W_Q^h (static, no forward pass needed) + +Output: + stats.json — numeric summary per layer / head + activations stats by layer accumulated across a calibration set + +Goal: + Compare entropy(L) (dynamic readout) against static spectrum of g_L (parameter-only + prediction). Agreement => schedule is parameter-intrinsic and a scalar per-iteration + T suffices. Disagreement => content-adaptive structure lives in the activations. +""" +import argparse +import json +import os +import math +import torch +import torch.nn.functional as F +from transformers import AutoModelForCausalLM, AutoTokenizer + + +CALIBRATION_PROMPTS = [ + # general knowledge + "The Eiffel Tower is located in", + "Photosynthesis is the process by which", + "The three branches of the US government are", + # math / reasoning + "If a train travels 60 miles per hour for 2.5 hours, the total distance covered is", + "Solve for x: 3x + 7 = 22. 
The answer is x =", + "The derivative of x^3 + 2x^2 is", + # code + "def fibonacci(n):\n if n < 2:\n return n\n return", + "# Python list comprehension to square even numbers in 0-9\nresult = ", + "SELECT name, age FROM users WHERE", + # narrative / long-form + "She opened the old wooden box and found", + "The argument in favor of renewable energy is", + # chat / instruction + "User: What is the capital of Australia?\nAssistant:", + "Write a haiku about autumn:\n", + # factual / lookup + "Albert Einstein was born in the year", + "The speed of light in vacuum is approximately", + # conversational + "I really loved that movie because", + "The main difference between a virus and a bacterium is", + # translation-ish + "The French word for 'apple' is", + # edge cases + "1 + 1 = ", + "Once upon a time, in a land far away,", +] + + +@torch.no_grad() +def measure_model(model_name: str, out_path: str, max_seq_len: int = 256, dtype=torch.bfloat16): + print(f"Loading {model_name} ...", flush=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=dtype, + device_map="cuda", + trust_remote_code=True, + attn_implementation="eager", # need raw attention probabilities + ) + model.eval() + + cfg = model.config + num_layers = cfg.num_hidden_layers + num_heads = cfg.num_attention_heads + hidden = cfg.hidden_size + head_dim = getattr(cfg, "head_dim", hidden // num_heads) + num_kv_heads = getattr(cfg, "num_key_value_heads", num_heads) + print(f" num_hidden_layers={num_layers} num_attention_heads={num_heads} " + f"num_kv_heads={num_kv_heads} head_dim={head_dim} hidden_size={hidden}", + flush=True) + + # ---- Static (parameter-only) readout ---- + # Per layer, per head h, compute the metric g^h = W_K^h^T W_Q^h (shape head_dim x head_dim) + # and record its singular spectrum. Metric norm is our "static temperature" prediction. 
+ # With grouped-query attention, each query head shares a KV head; we compute metric per + # query head using the shared KV head. + static_stats = [] + for L, layer in enumerate(model.model.layers): + attn = layer.self_attn + W_Q = attn.q_proj.weight.detach().float().cpu() # (num_heads*head_dim, hidden) + W_K = attn.k_proj.weight.detach().float().cpu() # (num_kv_heads*head_dim, hidden) + + per_head_metric_fro = [] + per_head_metric_op = [] + per_head_metric_singvals = [] + for h in range(num_heads): + kv_h = (h * num_kv_heads) // num_heads + wq_h = W_Q[h * head_dim:(h + 1) * head_dim] # (head_dim, hidden) + wk_h = W_K[kv_h * head_dim:(kv_h + 1) * head_dim] # (head_dim, hidden) + # metric on hidden space: M = W_K^h^T W_Q^h shape (hidden, hidden). + # But we only need its non-zero spectrum; equivalently SVD of wk_h^T @ wq_h, + # or simpler: singular values of (wk_h @ wq_h.T) which is head_dim x head_dim. + small = wk_h @ wq_h.T # (head_dim, head_dim) + s = torch.linalg.svdvals(small) # (head_dim,) + per_head_metric_fro.append(float(s.pow(2).sum().sqrt())) + per_head_metric_op.append(float(s.max())) + per_head_metric_singvals.append(s.tolist()) + static_stats.append({ + "layer": L, + "metric_fro_per_head": per_head_metric_fro, + "metric_op_per_head": per_head_metric_op, + "metric_singvals_per_head": per_head_metric_singvals, + }) + if L % 8 == 0: + print(f" static layer {L}: mean op-norm over heads = " + f"{sum(per_head_metric_op)/len(per_head_metric_op):.3f}", + flush=True) + + # ---- Dynamic (activation) readout ---- + # Hook each attention layer with output_attentions. Per layer, per head, accumulate + # sum of attention entropy and sum of pre-softmax logit magnitude across the calibration set. 
+ acc_entropy = torch.zeros(num_layers, num_heads, dtype=torch.float64) + acc_logit_mag = torch.zeros(num_layers, num_heads, dtype=torch.float64) + acc_logit_var = torch.zeros(num_layers, num_heads, dtype=torch.float64) + acc_n_positions = torch.zeros(num_layers, dtype=torch.float64) + + # The simplest path: run with output_attentions=True; eager impl returns attn probs. + # We cannot get pre-softmax logits from the HF API directly; extract them manually + # via a forward-pre-hook that snapshots Q and K, compute Q@K^T / sqrt(head_dim), and + # compare against attention_mask (we care about unmasked positions only). + + captured = {} + + def make_hook(layer_idx): + def hook(module, inp, out): + # eager attention returns (attn_output, attn_weights, past_key_value) + # attn_weights has shape (bsz, num_heads, q_len, k_len) + if isinstance(out, tuple) and len(out) >= 2 and out[1] is not None: + captured[layer_idx] = out[1].detach() + else: + captured[layer_idx] = None + return hook + + hooks = [] + for L, layer in enumerate(model.model.layers): + h = layer.self_attn.register_forward_hook(make_hook(L)) + hooks.append(h) + + for i, prompt in enumerate(CALIBRATION_PROMPTS): + inp = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_seq_len).to("cuda") + captured.clear() + _ = model(**inp, output_attentions=True, use_cache=False) + seq_len = inp["input_ids"].shape[1] + + for L in range(num_layers): + aw = captured.get(L, None) + if aw is None: + continue + # aw: (1, num_heads, q_len, k_len), softmax over last dim with causal mask + # entropy: -sum p log p over last dim. Positions with fewer valid keys have + # naturally lower max entropy; we average over positions anyway. + p = aw.float().squeeze(0) # (num_heads, q_len, k_len) + eps = 1e-12 + ent = -(p * (p + eps).log()).sum(dim=-1) # (num_heads, q_len) + acc_entropy[L] += ent.mean(dim=-1).cpu().double() + + # Back out the logits. 
For causal softmax, logit_ij = log p_ij + c(i) for some + # row constant c(i); we can recover up to row constant by log p (masking zeros). + # To get a usable logit magnitude, we take the (unmasked) per-row std. + logp = (p + eps).log() # (num_heads, q_len, k_len) + # mask invalid keys (p==0 means masked) + valid = (p > 0).float() + denom = valid.sum(dim=-1).clamp_min(1) + mean_logp = (logp * valid).sum(dim=-1) / denom + centered = (logp - mean_logp.unsqueeze(-1)) * valid + var_logp = (centered.pow(2).sum(dim=-1) / denom) + # per-row std of logits is a direct readout of logit magnitude (== sharpness) + row_std = var_logp.clamp_min(0).sqrt() # (num_heads, q_len) + acc_logit_mag[L] += row_std.mean(dim=-1).cpu().double() + acc_logit_var[L] += var_logp.mean(dim=-1).cpu().double() + + acc_n_positions += 1 # once per prompt + + if i % 5 == 0: + print(f" prompt {i+1}/{len(CALIBRATION_PROMPTS)} len={seq_len}", flush=True) + + for h in hooks: + h.remove() + + # Normalize by number of prompts (all contributed 1 sample per layer/head) + n = max(len(CALIBRATION_PROMPTS), 1) + mean_entropy = (acc_entropy / n).tolist() + mean_logit_mag = (acc_logit_mag / n).tolist() + mean_logit_var = (acc_logit_var / n).tolist() + + # Assemble output + dynamic_stats = [] + for L in range(num_layers): + dynamic_stats.append({ + "layer": L, + "mean_attention_entropy_per_head": mean_entropy[L], + "mean_logit_std_per_head": mean_logit_mag[L], + "mean_logit_var_per_head": mean_logit_var[L], + "mean_attention_entropy": sum(mean_entropy[L]) / num_heads, + "mean_logit_std": sum(mean_logit_mag[L]) / num_heads, + }) + + output = { + "model": model_name, + "num_layers": num_layers, + "num_heads": num_heads, + "num_kv_heads": num_kv_heads, + "head_dim": head_dim, + "hidden_size": hidden, + "n_prompts": len(CALIBRATION_PROMPTS), + "static": static_stats, + "dynamic": dynamic_stats, + } + + with open(out_path, "w") as f: + json.dump(output, f, indent=2) + print(f"\nWrote {out_path}", flush=True) + + # Quick 
summary to console + print("\nPer-layer schedule readout (averaged over heads):") + print(f" {'L':>3} {'mean_entropy':>14} {'mean_logit_std':>16} {'mean_metric_op':>16}") + for L in range(num_layers): + mean_op = sum(static_stats[L]["metric_op_per_head"]) / num_heads + print(f" {L:>3} " + f"{dynamic_stats[L]['mean_attention_entropy']:>14.4f} " + f"{dynamic_stats[L]['mean_logit_std']:>16.4f} " + f"{mean_op:>16.4f}") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", default="Qwen/Qwen3-8B") + parser.add_argument("--out", default="/tmp/sa-schedule-readout.json") + parser.add_argument("--max-seq-len", type=int, default=256) + args = parser.parse_args() + measure_model(args.model, args.out, max_seq_len=args.max_seq_len) + + +if __name__ == "__main__": + main() diff --git a/sa-schedule-topblock-swap.py b/sa-schedule-topblock-swap.py new file mode 100644 index 0000000..ec582d8 --- /dev/null +++ b/sa-schedule-topblock-swap.py @@ -0,0 +1,498 @@ +"""Top-block replacement experiment: test SA-schedule hypothesis by +replacing the last 8 layers of Qwen3-4B with variants that progressively +strip out the learned schedule / specialization. + +Variants: + baseline — unmodified reference (PPL sanity check) + schedule_fit — replace input_ln.γ magnitude in top block with + fitted Kirkpatrick γ(L) = 3.53·exp(0.119·L). Directions + preserved, projection weights untouched. + single_op — use layer 35's projection weights for ALL top-block + layers (strip specialization), combined with the fitted + schedule γ(L). Tests if per-layer specialization in top + block is load-bearing or replaceable by schedule. + uniform_gamma — set all top-block input_ln.γ magnitudes to the middle + layer's value (no schedule at all in top block). Tests + necessity of schedule itself. + +Eval: perplexity on a concatenation of calibration prompts + a short +excerpt. Also generation quality on a handful of diagnostic prompts. 
+""" +import argparse +import math +import os +import torch +import torch.nn.functional as F +from transformers import AutoModelForCausalLM, AutoTokenizer + + +# From sa-schedule-fit-gamma.py on Qwen3-4B null-residual data: +# input_ln.γ magnitude ≈ 3.53 · exp(0.119 · L), R² = 0.95 +# Defaults for 4B. Override via env SCHEDULE_A / SCHEDULE_B for other models. +# 32B fit: a=1.02, b=0.0873 +SCHEDULE_A = float(os.environ.get("SCHEDULE_A", "3.53")) if "SCHEDULE_A" in os.environ else 3.53 +SCHEDULE_B = float(os.environ.get("SCHEDULE_B", "0.1191")) if "SCHEDULE_B" in os.environ else 0.1191 + +BLOCK_START = int(os.environ.get("BLOCK_START", 28)) +BLOCK_END = int(os.environ.get("BLOCK_END", 35)) +# Optional: comma-separated "s1-e1,s2-e2,..." blocks for multi-block merge +BLOCKS_ENV = os.environ.get("BLOCKS", "") +if BLOCKS_ENV: + BLOCKS = [tuple(int(x) for x in p.split("-")) for p in BLOCKS_ENV.split(",")] +else: + BLOCKS = [(BLOCK_START, BLOCK_END)] + +CALIB = [ + "The Eiffel Tower is located in", + "Photosynthesis is the process by which", + "The three branches of the US government are the legislative, executive, and", + "If a train travels 60 miles per hour for 2.5 hours, the total distance covered is", + "Solve for x: 3x + 7 = 22. 
The answer is x =", + "The derivative of x^3 + 2x^2 is", + "def fibonacci(n):\n if n < 2:\n return n\n return", + "# Python list comprehension to square even numbers in 0-9\nresult = ", + "SELECT name, age FROM users WHERE", + "She opened the old wooden box and found", + "The argument in favor of renewable energy is", + "User: What is the capital of Australia?\nAssistant:", + "Write a haiku about autumn:\n", + "Albert Einstein was born in the year", + "The speed of light in vacuum is approximately", + "I really loved that movie because", + "The main difference between a virus and a bacterium is", + "The French word for 'apple' is", + "1 + 1 = ", + "Once upon a time, in a land far away,", + "The key insight of general relativity is that gravity is not a force but", + "Water boils at 100 degrees Celsius at standard atmospheric pressure. At higher", + "In object-oriented programming, encapsulation refers to", + "The mitochondria is often called the powerhouse of the cell because it", + "Shakespeare's Hamlet begins with the famous line", +] + +GEN_PROMPTS = [ + "The capital of France is", + "2 + 2 =", + "def reverse_string(s):\n return", + "Albert Einstein developed the theory of", +] + + +def load_model(name=None): + if name is None: + name = os.environ.get("MODEL", "Qwen/Qwen3-4B") + print(f"Loading {name}...", flush=True) + tok = AutoTokenizer.from_pretrained(name, trust_remote_code=True) + m = AutoModelForCausalLM.from_pretrained( + name, torch_dtype=torch.bfloat16, device_map="cuda", + trust_remote_code=True, attn_implementation="eager", + ) + m.eval() + return m, tok + + +def _merge_block(model, block_start, block_end): + """Arithmetic-mean merge projections in [block_start, block_end]; set γ per schedule.""" + layers = [model.model.layers[L] for L in range(block_start, block_end + 1)] + param_names = [ + ("self_attn.q_proj.weight", lambda l: l.self_attn.q_proj.weight), + ("self_attn.k_proj.weight", lambda l: l.self_attn.k_proj.weight), + 
("self_attn.v_proj.weight", lambda l: l.self_attn.v_proj.weight), + ("self_attn.o_proj.weight", lambda l: l.self_attn.o_proj.weight), + ("mlp.gate_proj.weight", lambda l: l.mlp.gate_proj.weight), + ("mlp.up_proj.weight", lambda l: l.mlp.up_proj.weight), + ("mlp.down_proj.weight", lambda l: l.mlp.down_proj.weight), + ] + merged = {} + for name, getter in param_names: + stack = torch.stack([getter(l).data.float() for l in layers], dim=0) + merged[name] = stack.mean(dim=0).to(getter(layers[0]).data.dtype) + for l in layers: + l.self_attn.q_proj.weight.data.copy_(merged["self_attn.q_proj.weight"]) + l.self_attn.k_proj.weight.data.copy_(merged["self_attn.k_proj.weight"]) + l.self_attn.v_proj.weight.data.copy_(merged["self_attn.v_proj.weight"]) + l.self_attn.o_proj.weight.data.copy_(merged["self_attn.o_proj.weight"]) + l.mlp.gate_proj.weight.data.copy_(merged["mlp.gate_proj.weight"]) + l.mlp.up_proj.weight.data.copy_(merged["mlp.up_proj.weight"]) + l.mlp.down_proj.weight.data.copy_(merged["mlp.down_proj.weight"]) + for L in range(block_start, block_end + 1): + predicted = SCHEDULE_A * math.exp(SCHEDULE_B * L) + gamma = model.model.layers[L].input_layernorm.weight.data + gamma.mul_(predicted / gamma.norm().item()) + + +def _procrustes(M): + """Orthogonal R = U V^T maximizing tr(R M) where M = U Σ V^T.""" + U, _, Vh = torch.linalg.svd(M.float(), full_matrices=False) + return U @ Vh + + +def _aligned_merge_block(model, block_start, block_end, align_ff=False): + """Procrustes-align per-head d_h basis (and optionally d_ff) of each + layer in [block_start, block_end] to a reference (middle), then + arithmetic-mean. 
Attention rotation is a true gauge; FF rotation is + not (SiLU breaks it) — align_ff defaults off.""" + cfg = model.config + num_heads = cfg.num_attention_heads + num_kv = getattr(cfg, "num_key_value_heads", num_heads) + hidden = cfg.hidden_size + d_h = getattr(cfg, "head_dim", hidden // num_heads) + + ref_L = (block_start + block_end) // 2 + ref = model.model.layers[ref_L] + dev = ref.self_attn.q_proj.weight.device + dtype = ref.self_attn.q_proj.weight.dtype + + # Reference views, fp32 on device + Qr = ref.self_attn.q_proj.weight.data.float().reshape(num_heads, d_h, hidden) + Kr = ref.self_attn.k_proj.weight.data.float().reshape(num_kv, d_h, hidden) + Vr = ref.self_attn.v_proj.weight.data.float().reshape(num_kv, d_h, hidden) + Or = ref.self_attn.o_proj.weight.data.float().reshape(hidden, num_heads, d_h).permute(1, 0, 2).contiguous() + + if align_ff: + d_ff = cfg.intermediate_size + Gr = ref.mlp.gate_proj.weight.data.float() + Ur = ref.mlp.up_proj.weight.data.float() + Dr = ref.mlp.down_proj.weight.data.float() + + rotated = [] + for L in range(block_start, block_end + 1): + layer = model.model.layers[L] + Q = layer.self_attn.q_proj.weight.data.float().reshape(num_heads, d_h, hidden) + K = layer.self_attn.k_proj.weight.data.float().reshape(num_kv, d_h, hidden) + V = layer.self_attn.v_proj.weight.data.float().reshape(num_kv, d_h, hidden) + O = layer.self_attn.o_proj.weight.data.float().reshape(hidden, num_heads, d_h).permute(1, 0, 2).contiguous() + + if L == ref_L: + Q_new, K_new, V_new, O_new = Q.clone(), K.clone(), V.clone(), O.clone() + else: + Q_new = torch.empty_like(Q) + K_new = torch.empty_like(K) + V_new = torch.empty_like(V) + O_new = torch.empty_like(O) + for h in range(num_heads): + kv_h = (h * num_kv) // num_heads + # Cross-correlation: want R s.t. R @ Q ≈ Qr (row-space align). 
+ # For per-head (d_h, hidden): M = Qr @ Q.T + Kr @ K.T + Vr @ V.T + Or^T @ O + # (Or, O are (hidden, d_h) per head) + M = (Qr[h] @ Q[h].T + + Kr[kv_h] @ K[kv_h].T + + Vr[kv_h] @ V[kv_h].T + + Or[h].T @ O[h]) + R = _procrustes(M) + Q_new[h] = R @ Q[h] + K_new[kv_h] = R @ K[kv_h] + V_new[kv_h] = R @ V[kv_h] + O_new[h] = O[h] @ R.T + + rotated.append({ + "q": Q_new.reshape(num_heads * d_h, hidden), + "k": K_new.reshape(num_kv * d_h, hidden), + "v": V_new.reshape(num_kv * d_h, hidden), + "o": O_new.permute(1, 0, 2).reshape(hidden, num_heads * d_h), + }) + + # Average rotated attention + q_avg = torch.stack([r["q"] for r in rotated]).mean(0).to(dtype) + k_avg = torch.stack([r["k"] for r in rotated]).mean(0).to(dtype) + v_avg = torch.stack([r["v"] for r in rotated]).mean(0).to(dtype) + o_avg = torch.stack([r["o"] for r in rotated]).mean(0).to(dtype) + + # FF: naive mean (rotation gauge is fake through SiLU) + layers = [model.model.layers[L] for L in range(block_start, block_end + 1)] + gate_avg = torch.stack([l.mlp.gate_proj.weight.data.float() for l in layers]).mean(0).to(dtype) + up_avg = torch.stack([l.mlp.up_proj.weight.data.float() for l in layers]).mean(0).to(dtype) + down_avg = torch.stack([l.mlp.down_proj.weight.data.float() for l in layers]).mean(0).to(dtype) + + # q_norm/k_norm γ: copy from reference (they're basis-dependent; no clean average in rotated frame) + ref_qn = ref.self_attn.q_norm.weight.data.clone() if getattr(ref.self_attn, "q_norm", None) is not None else None + ref_kn = ref.self_attn.k_norm.weight.data.clone() if getattr(ref.self_attn, "k_norm", None) is not None else None + + for l in layers: + l.self_attn.q_proj.weight.data.copy_(q_avg) + l.self_attn.k_proj.weight.data.copy_(k_avg) + l.self_attn.v_proj.weight.data.copy_(v_avg) + l.self_attn.o_proj.weight.data.copy_(o_avg) + l.mlp.gate_proj.weight.data.copy_(gate_avg) + l.mlp.up_proj.weight.data.copy_(up_avg) + l.mlp.down_proj.weight.data.copy_(down_avg) + if ref_qn is not None and 
getattr(l.self_attn, "q_norm", None) is not None: + l.self_attn.q_norm.weight.data.copy_(ref_qn) + if ref_kn is not None and getattr(l.self_attn, "k_norm", None) is not None: + l.self_attn.k_norm.weight.data.copy_(ref_kn) + + for L in range(block_start, block_end + 1): + predicted = SCHEDULE_A * math.exp(SCHEDULE_B * L) + gamma = model.model.layers[L].input_layernorm.weight.data + gamma.mul_(predicted / gamma.norm().item()) + + +def apply_variant(model, variant): + """Modify model in place according to variant.""" + if variant == "baseline": + return + + if variant == "schedule_fit": + for L in range(BLOCK_START, BLOCK_END + 1): + predicted = SCHEDULE_A * math.exp(SCHEDULE_B * L) + layer = model.model.layers[L] + gamma = layer.input_layernorm.weight.data + cur_norm = gamma.norm().item() + # Preserve direction, scale to predicted magnitude + gamma.mul_(predicted / cur_norm) + + elif variant == "single_op": + # Use middle-of-block as reference, not end (more representative) + ref_L = (BLOCK_START + BLOCK_END) // 2 + ref = model.model.layers[ref_L] + for L in range(BLOCK_START, BLOCK_END + 1): + if L == ref_L: + continue + tgt = model.model.layers[L] + tgt.self_attn.q_proj.weight.data.copy_(ref.self_attn.q_proj.weight.data) + tgt.self_attn.k_proj.weight.data.copy_(ref.self_attn.k_proj.weight.data) + tgt.self_attn.v_proj.weight.data.copy_(ref.self_attn.v_proj.weight.data) + tgt.self_attn.o_proj.weight.data.copy_(ref.self_attn.o_proj.weight.data) + tgt.mlp.gate_proj.weight.data.copy_(ref.mlp.gate_proj.weight.data) + tgt.mlp.up_proj.weight.data.copy_(ref.mlp.up_proj.weight.data) + tgt.mlp.down_proj.weight.data.copy_(ref.mlp.down_proj.weight.data) + # q_norm, k_norm: copy too + if hasattr(tgt.self_attn, "q_norm") and tgt.self_attn.q_norm is not None: + tgt.self_attn.q_norm.weight.data.copy_(ref.self_attn.q_norm.weight.data) + if hasattr(tgt.self_attn, "k_norm") and tgt.self_attn.k_norm is not None: + tgt.self_attn.k_norm.weight.data.copy_(ref.self_attn.k_norm.weight.data) 
+ # Keep each layer's OWN input_ln.γ direction but set magnitude to schedule + predicted = SCHEDULE_A * math.exp(SCHEDULE_B * L) + gamma = tgt.input_layernorm.weight.data + gamma.mul_(predicted / gamma.norm().item()) + # post_attn_ln γ: leave as-is for now (could also fit & set) + + elif variant == "ties_op": + # TIES-Merging (Yadav et al. 2023): trim, elect-sign, disjoint merge. + # Operates per parameter family across the N block layers. + density = float(os.environ.get("TIES_DENSITY", "0.2")) + layers = [model.model.layers[L] for L in range(BLOCK_START, BLOCK_END + 1)] + param_names = [ + ("self_attn.q_proj.weight", lambda l: l.self_attn.q_proj.weight), + ("self_attn.k_proj.weight", lambda l: l.self_attn.k_proj.weight), + ("self_attn.v_proj.weight", lambda l: l.self_attn.v_proj.weight), + ("self_attn.o_proj.weight", lambda l: l.self_attn.o_proj.weight), + ("mlp.gate_proj.weight", lambda l: l.mlp.gate_proj.weight), + ("mlp.up_proj.weight", lambda l: l.mlp.up_proj.weight), + ("mlp.down_proj.weight", lambda l: l.mlp.down_proj.weight), + ] + + def ties_merge(tensors, density): + # tensors: list of (out, in) float tensors, same shape + stack = torch.stack([t.float() for t in tensors], dim=0) # (N, out, in) + # --- Step 1: Trim to top-density fraction per tensor --- + n = stack.shape[0] + flat = stack.view(n, -1) + k = int(flat.shape[1] * density) + abs_flat = flat.abs() + # Find magnitude threshold per tensor at top-k + topk_vals, _ = abs_flat.topk(k=k, dim=1) + threshold = topk_vals[:, -1:].expand_as(abs_flat) + mask = abs_flat >= threshold + trimmed = (flat * mask.float()).view_as(stack) + # --- Step 2: Elect sign (majority by total magnitude) --- + mag_per_sign = trimmed.sum(dim=0) # (out, in), signed sum + elected = torch.sign(mag_per_sign) # +1/-1/0 + # --- Step 3: Disjoint merge (average params agreeing with elected sign) --- + agree = (torch.sign(trimmed) == elected.unsqueeze(0)).float() + contributing_count = agree.sum(dim=0).clamp_min(1) + merged_sum = 
(trimmed * agree).sum(dim=0) + merged = merged_sum / contributing_count + return merged + + merged = {} + for name, getter in param_names: + tensors = [getter(l).data for l in layers] + merged[name] = ties_merge(tensors, density).to(getter(layers[0]).data.dtype) + + for l in layers: + l.self_attn.q_proj.weight.data.copy_(merged["self_attn.q_proj.weight"]) + l.self_attn.k_proj.weight.data.copy_(merged["self_attn.k_proj.weight"]) + l.self_attn.v_proj.weight.data.copy_(merged["self_attn.v_proj.weight"]) + l.self_attn.o_proj.weight.data.copy_(merged["self_attn.o_proj.weight"]) + l.mlp.gate_proj.weight.data.copy_(merged["mlp.gate_proj.weight"]) + l.mlp.up_proj.weight.data.copy_(merged["mlp.up_proj.weight"]) + l.mlp.down_proj.weight.data.copy_(merged["mlp.down_proj.weight"]) + + for L in range(BLOCK_START, BLOCK_END + 1): + predicted = SCHEDULE_A * math.exp(SCHEDULE_B * L) + gamma = model.model.layers[L].input_layernorm.weight.data + gamma.mul_(predicted / gamma.norm().item()) + + elif variant == "merged_op": + # Arithmetic mean, for each block in BLOCKS (can be multiple) + for (bs, be) in BLOCKS: + _merge_block(model, bs, be) + return + + elif variant == "aligned_merged_op": + # Procrustes-align per-head d_h basis to block-middle, then mean. + # FF averaged naively (SiLU breaks rotation gauge for FF). + for (bs, be) in BLOCKS: + _aligned_merge_block(model, bs, be, align_ff=False) + return + + elif variant == "flat_merged_op": + # Mean projections AND flatten γ across block. Everything in block + # becomes N copies of the same operator. If block is truly high-T + # diffusion, PPL should match merged_op (schedule is gauge, not + # load-bearing). If schedule helps, flattening γ will hurt. 
+ for (bs, be) in BLOCKS: + layers = [model.model.layers[L] for L in range(bs, be + 1)] + param_names = [ + ("self_attn.q_proj.weight", lambda l: l.self_attn.q_proj.weight), + ("self_attn.k_proj.weight", lambda l: l.self_attn.k_proj.weight), + ("self_attn.v_proj.weight", lambda l: l.self_attn.v_proj.weight), + ("self_attn.o_proj.weight", lambda l: l.self_attn.o_proj.weight), + ("mlp.gate_proj.weight", lambda l: l.mlp.gate_proj.weight), + ("mlp.up_proj.weight", lambda l: l.mlp.up_proj.weight), + ("mlp.down_proj.weight", lambda l: l.mlp.down_proj.weight), + ] + merged = {} + for name, getter in param_names: + stack = torch.stack([getter(l).data.float() for l in layers], dim=0) + merged[name] = stack.mean(dim=0).to(getter(layers[0]).data.dtype) + gamma_mean = torch.stack([l.input_layernorm.weight.data.float() + for l in layers]).mean(0).to(layers[0].input_layernorm.weight.data.dtype) + post_attn_mean = torch.stack([l.post_attention_layernorm.weight.data.float() + for l in layers]).mean(0).to(layers[0].post_attention_layernorm.weight.data.dtype) + for l in layers: + l.self_attn.q_proj.weight.data.copy_(merged["self_attn.q_proj.weight"]) + l.self_attn.k_proj.weight.data.copy_(merged["self_attn.k_proj.weight"]) + l.self_attn.v_proj.weight.data.copy_(merged["self_attn.v_proj.weight"]) + l.self_attn.o_proj.weight.data.copy_(merged["self_attn.o_proj.weight"]) + l.mlp.gate_proj.weight.data.copy_(merged["mlp.gate_proj.weight"]) + l.mlp.up_proj.weight.data.copy_(merged["mlp.up_proj.weight"]) + l.mlp.down_proj.weight.data.copy_(merged["mlp.down_proj.weight"]) + l.input_layernorm.weight.data.copy_(gamma_mean) + l.post_attention_layernorm.weight.data.copy_(post_attn_mean) + return + + elif variant == "reverse_order": + # Reverse the order of layers within each block to test whether + # the block implements a trajectory (order-dependent) or iid + # diffusion (order-free). 
+ import torch.nn as nn + layers_list = list(model.model.layers) + for (bs, be) in BLOCKS: + rev = layers_list[bs:be + 1][::-1] + layers_list[bs:be + 1] = rev + model.model.layers = nn.ModuleList(layers_list) + # Re-set layer_idx on each layer so attention/cache uses the + # current position, not the original one. + for i, l in enumerate(model.model.layers): + if hasattr(l, "self_attn") and hasattr(l.self_attn, "layer_idx"): + l.self_attn.layer_idx = i + return + + elif variant == "merged_op_OLD_UNREACHABLE": + layers = [model.model.layers[L] for L in range(BLOCK_START, BLOCK_END + 1)] + n = len(layers) + param_names = [ + ("self_attn.q_proj.weight", lambda l: l.self_attn.q_proj.weight), + ("self_attn.k_proj.weight", lambda l: l.self_attn.k_proj.weight), + ("self_attn.v_proj.weight", lambda l: l.self_attn.v_proj.weight), + ("self_attn.o_proj.weight", lambda l: l.self_attn.o_proj.weight), + ("mlp.gate_proj.weight", lambda l: l.mlp.gate_proj.weight), + ("mlp.up_proj.weight", lambda l: l.mlp.up_proj.weight), + ("mlp.down_proj.weight", lambda l: l.mlp.down_proj.weight), + ] + merged = {} + for name, getter in param_names: + stack = torch.stack([getter(l).data.float() for l in layers], dim=0) + merged[name] = stack.mean(dim=0).to(getter(layers[0]).data.dtype) + + for l in layers: + l.self_attn.q_proj.weight.data.copy_(merged["self_attn.q_proj.weight"]) + l.self_attn.k_proj.weight.data.copy_(merged["self_attn.k_proj.weight"]) + l.self_attn.v_proj.weight.data.copy_(merged["self_attn.v_proj.weight"]) + l.self_attn.o_proj.weight.data.copy_(merged["self_attn.o_proj.weight"]) + l.mlp.gate_proj.weight.data.copy_(merged["mlp.gate_proj.weight"]) + l.mlp.up_proj.weight.data.copy_(merged["mlp.up_proj.weight"]) + l.mlp.down_proj.weight.data.copy_(merged["mlp.down_proj.weight"]) + + # Set γ to scheduled values per layer + for L in range(BLOCK_START, BLOCK_END + 1): + predicted = SCHEDULE_A * math.exp(SCHEDULE_B * L) + gamma = model.model.layers[L].input_layernorm.weight.data + 
gamma.mul_(predicted / gamma.norm().item()) + + elif variant == "uniform_gamma": + mid_L = (BLOCK_START + BLOCK_END) // 2 + mid_gamma = model.model.layers[mid_L].input_layernorm.weight.data.clone() + for L in range(BLOCK_START, BLOCK_END + 1): + model.model.layers[L].input_layernorm.weight.data.copy_(mid_gamma) + + else: + raise ValueError(f"Unknown variant {variant}") + + +@torch.no_grad() +def perplexity(model, tok, texts, max_len=512): + total_nll = 0.0 + total_tok = 0 + for text in texts: + enc = tok(text, return_tensors="pt", truncation=True, max_length=max_len).to("cuda") + if enc.input_ids.shape[1] < 2: + continue + out = model(**enc, labels=enc.input_ids) + n = enc.input_ids.shape[1] - 1 + total_nll += float(out.loss.item()) * n + total_tok += n + return math.exp(total_nll / max(total_tok, 1)) + + +@torch.no_grad() +def generate_sample(model, tok, prompt, max_new=40): + enc = tok(prompt, return_tensors="pt").to("cuda") + out = model.generate(**enc, max_new_tokens=max_new, do_sample=False, + pad_token_id=tok.eos_token_id) + return tok.decode(out[0], skip_special_tokens=True) + + +def run_variant(variant): + model, tok = load_model() + apply_variant(model, variant) + print(f"\n=== variant: {variant} ===", flush=True) + ppl = perplexity(model, tok, CALIB) + print(f" perplexity: {ppl:.3f}", flush=True) + for p in GEN_PROMPTS: + out = generate_sample(model, tok, p) + print(f" [{p!r}] -> {out[:200]!r}", flush=True) + del model + torch.cuda.empty_cache() + return ppl + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--variant", default="all", + choices=["all", "baseline", "schedule_fit", + "single_op", "uniform_gamma", "merged_op", + "aligned_merged_op", "flat_merged_op", + "reverse_order", "ties_op"]) + ap.add_argument("--ties-density", type=float, default=0.2, + help="TIES trim density (fraction of top-magnitude params to keep)") + args = ap.parse_args() + + variants = (["baseline", "schedule_fit", "single_op", "uniform_gamma"] + if 
args.variant == "all" else [args.variant]) + results = {} + for v in variants: + results[v] = run_variant(v) + + if len(results) > 1: + print("\n=== Summary ===") + b = results.get("baseline", None) + for v, ppl in results.items(): + rel = f" (×{ppl/b:.2f} baseline)" if b else "" + print(f" {v:<15} PPL {ppl:>8.3f}{rel}") + + +if __name__ == "__main__": + main() diff --git a/src/agent/mod.rs b/src/agent/mod.rs index 1db40b1..6cd24ed 100644 --- a/src/agent/mod.rs +++ b/src/agent/mod.rs @@ -59,7 +59,7 @@ const ACTIVITY_LINGER: std::time::Duration = std::time::Duration::from_secs(5); impl Drop for ActivityGuard { fn drop(&mut self) { - if let Ok(mut st) = self.agent.state.try_lock() { + { let mut st = self.agent.state.lock_blocking(); if let Some(entry) = st.activities.iter_mut().find(|a| a.id == self.id) { entry.label.push_str(" (complete)"); entry.expires_at = std::time::Instant::now() + ACTIVITY_LINGER; diff --git a/src/agent/tools/mcp_client.rs b/src/agent/tools/mcp_client.rs index 78c06f8..50c4e47 100644 --- a/src/agent/tools/mcp_client.rs +++ b/src/agent/tools/mcp_client.rs @@ -152,7 +152,7 @@ async fn ensure_init(agent: Option<&std::sync::Arc>) -> Res let msg = format!("MCP server {} failed: {:#}", cfg.name, e); dbglog!("{}", msg); if let Some(a) = agent { - if let Ok(mut st) = a.state.try_lock() { + { let mut st = a.state.lock_blocking(); st.notify(msg); } } diff --git a/src/locks.rs b/src/locks.rs index dda4cb2..6004034 100644 --- a/src/locks.rs +++ b/src/locks.rs @@ -135,6 +135,23 @@ impl TrackedMutex { location, }) } + + /// Block the current thread until the lock is acquired. + /// Safe to call from sync contexts (UI thread, slash commands) where + /// .await isn't available. Uses block_in_place so the tokio runtime + /// can schedule other tasks while we wait. 
+ #[track_caller] + pub fn lock_blocking(&self) -> TrackedMutexGuard<'_, T> { + let location = Location::caller(); + let guard = tokio::task::block_in_place(|| { + futures::executor::block_on(self.inner.lock()) + }); + TrackedMutexGuard { + guard, + acquired_at: Instant::now(), + location, + } + } } pub struct TrackedMutexGuard<'a, T> { diff --git a/src/subconscious/compare.rs b/src/subconscious/compare.rs index f2652ce..8e42851 100644 --- a/src/subconscious/compare.rs +++ b/src/subconscious/compare.rs @@ -104,6 +104,6 @@ async fn run( prior_context: render_prior_context(entries, entry_idx, 2), timestamp_ns: node_timestamp_ns(node), }); - if let Ok(st) = agent.state.try_lock() { st.changed.notify_one(); } + { let st = agent.state.lock_blocking(); st.changed.notify_one(); } } } diff --git a/src/subconscious/learn.rs b/src/subconscious/learn.rs index 129e26b..8c739b4 100644 --- a/src/subconscious/learn.rs +++ b/src/subconscious/learn.rs @@ -736,7 +736,7 @@ async fn run_finetune( gen_alternates, &activity, move |c| { shared.lock().unwrap().finetune_candidates.push(c); - if let Ok(st) = agent.state.try_lock() { st.changed.notify_one(); } + { let st = agent.state.lock_blocking(); st.changed.notify_one(); } }, ).await { Ok((above_threshold, max_div)) => FinetuneScoringStats { diff --git a/src/user/chat.rs b/src/user/chat.rs index bd2df25..0fb8f45 100644 --- a/src/user/chat.rs +++ b/src/user/chat.rs @@ -34,12 +34,12 @@ fn commands() -> Vec { vec![ handler: |s, _| { let _ = s.mind_tx.send(MindCommand::NewSession); } }, SlashCommand { name: "/save", help: "Save session to disk", handler: |s, _| { - if let Ok(mut ag) = s.agent.state.try_lock() { ag.notify("saved"); } + { let mut ag = s.agent.state.lock_blocking(); ag.notify("saved"); } } }, SlashCommand { name: "/model", help: "Show/switch model (/model )", handler: |s, arg| { if arg.is_empty() { - if let Ok(mut ag) = s.agent.state.try_lock() { + { let mut ag = s.agent.state.lock_blocking(); let names = 
s.agent.app_config.model_names(); let label = if names.is_empty() { format!("model: {}", s.agent.model()) @@ -62,7 +62,7 @@ fn commands() -> Vec { vec![ SlashCommand { name: "/dmn", help: "Show DMN state", handler: |s, _| { let st = s.shared_mind.lock().unwrap(); - if let Ok(mut ag) = s.agent.state.try_lock() { + { let mut ag = s.agent.state.lock_blocking(); ag.notify(format!("DMN: {:?} ({}/{})", st.dmn, st.dmn_turns, st.max_dmn_turns)); } } }, @@ -71,7 +71,7 @@ fn commands() -> Vec { vec![ let mut st = s.shared_mind.lock().unwrap(); st.dmn = crate::mind::subconscious::State::Resting { since: std::time::Instant::now() }; st.dmn_turns = 0; - if let Ok(mut ag) = s.agent.state.try_lock() { ag.notify("DMN sleeping"); } + { let mut ag = s.agent.state.lock_blocking(); ag.notify("DMN sleeping"); } } }, SlashCommand { name: "/wake", help: "Wake DMN to foraging", handler: |s, _| { @@ -79,14 +79,14 @@ fn commands() -> Vec { vec![ if matches!(st.dmn, crate::mind::subconscious::State::Off) { crate::mind::subconscious::set_off(false); } st.dmn = crate::mind::subconscious::State::Foraging; st.dmn_turns = 0; - if let Ok(mut ag) = s.agent.state.try_lock() { ag.notify("DMN foraging"); } + { let mut ag = s.agent.state.lock_blocking(); ag.notify("DMN foraging"); } } }, SlashCommand { name: "/pause", help: "Full stop — no autonomous ticks (Ctrl+P)", handler: |s, _| { let mut st = s.shared_mind.lock().unwrap(); st.dmn = crate::mind::subconscious::State::Paused; st.dmn_turns = 0; - if let Ok(mut ag) = s.agent.state.try_lock() { ag.notify("DMN paused"); } + { let mut ag = s.agent.state.lock_blocking(); ag.notify("DMN paused"); } } }, SlashCommand { name: "/help", help: "Show this help", handler: |s, _| { notify_help(&s.agent); } }, @@ -116,7 +116,7 @@ pub async fn cmd_switch_model( } fn notify_help(agent: &std::sync::Arc) { - if let Ok(mut ag) = agent.state.try_lock() { + { let mut ag = agent.state.lock_blocking(); let mut help = String::new(); for cmd in &commands() { 
help.push_str(&format!("{:12} {}\n", cmd.name, cmd.help)); @@ -581,16 +581,10 @@ impl InteractScreen { self.pending_display_count = 0; let (generation, entries) = { - let st = match self.agent.state.try_lock() { - Ok(st) => st, - Err(_) => return, - }; + let st = self.agent.state.lock_blocking(); let generation = st.generation; drop(st); - let ctx = match self.agent.context.try_lock() { - Ok(ctx) => ctx, - Err(_) => return, - }; + let ctx = self.agent.context.lock_blocking(); (generation, ctx.conversation().to_vec()) }; @@ -654,7 +648,7 @@ impl InteractScreen { if let Some(cmd) = dispatch_command(input) { (cmd.handler)(self, &input[cmd.name.len()..].trim_start()); } else { - if let Ok(mut ag) = self.agent.state.try_lock() { + { let mut ag = self.agent.state.lock_blocking(); ag.notify(format!("unknown: {}", input.split_whitespace().next().unwrap_or(input))); } } @@ -770,9 +764,8 @@ impl InteractScreen { /// Draw the main (F1) screen — four-pane layout with status bar. fn draw_main(&mut self, frame: &mut Frame, size: Rect, app: &App) { // Main layout: content area + active tools overlay + status bar - let st_guard = app.agent.state.try_lock().ok(); - let tool_lines = st_guard.as_ref() - .map(|st| st.active_tools.len() as u16).unwrap_or(0); + let st_guard = app.agent.state.lock_blocking(); + let tool_lines = st_guard.active_tools.len() as u16; let main_chunks = Layout::default() .direction(Direction::Vertical) .constraints([ @@ -861,10 +854,9 @@ impl InteractScreen { frame.render_widget(gutter, input_chunks[0]); frame.render_widget(&self.textarea, input_chunks[1]); - if let Some(ref st) = st_guard { - if !st.active_tools.is_empty() { + if !st_guard.active_tools.is_empty() { let tool_style = Style::default().fg(Color::Yellow).add_modifier(Modifier::DIM); - let tool_text: Vec = st.active_tools.iter().map(|t| { + let tool_text: Vec = st_guard.active_tools.iter().map(|t| { let elapsed = t.started.elapsed().as_secs(); let line = if t.detail.is_empty() { format!(" [{}] 
({}s)", t.name, elapsed) @@ -875,7 +867,7 @@ impl InteractScreen { }).collect(); let tool_para = Paragraph::new(tool_text); frame.render_widget(tool_para, tools_overlay_area); - }} + } // Draw status bar with live activity indicator let timer = if !app.activity.is_empty() { @@ -1026,7 +1018,7 @@ impl ScreenView for InteractScreen { self.sync_from_agent(); // Read status from agent + mind state - if let Ok(mut st) = self.agent.state.try_lock() { + { let mut st = self.agent.state.lock_blocking(); st.expire_activities(); app.status.prompt_tokens = st.last_prompt_tokens; app.status.model = self.agent.model().to_string(); @@ -1036,7 +1028,7 @@ impl ScreenView for InteractScreen { app.activity_started = st.activities.last() .map(|a| a.started); } - if let Ok(ctx) = self.agent.context.try_lock() { + { let ctx = self.agent.context.lock_blocking(); let window = crate::agent::context::context_window(); if window > 0 { let sys = ctx.system().iter().map(|n| n.tokens()).sum::(); diff --git a/src/user/context.rs b/src/user/context.rs index 8edd926..c6765d0 100644 --- a/src/user/context.rs +++ b/src/user/context.rs @@ -20,10 +20,7 @@ impl ConsciousScreen { } fn read_context_views(&self) -> Vec { - let ctx = match self.agent.context.try_lock() { - Ok(ctx) => ctx, - Err(_) => return Vec::new(), - }; + let ctx = self.agent.context.lock_blocking(); let mut views: Vec = Vec::new(); @@ -161,8 +158,7 @@ impl ScreenView for ConsciousScreen { ))); lines.push(Line::raw(format!(" Reasoning: {}", app.reasoning_effort))); lines.push(Line::raw(format!(" Running processes: {}", app.running_processes))); - let tool_count = app.agent.state.try_lock() - .map(|st| st.active_tools.len()).unwrap_or(0); + let tool_count = { let st = app.agent.state.lock_blocking(); st.active_tools.len() }; lines.push(Line::raw(format!(" Active tools: {}", tool_count))); let block = pane_block("context") diff --git a/src/user/mod.rs b/src/user/mod.rs index 80754a1..cd617cc 100644 --- a/src/user/mod.rs +++ 
b/src/user/mod.rs @@ -292,7 +292,7 @@ async fn start(cli: crate::user::CliArgs) -> Result<()> { } fn hotkey_cycle_reasoning(mind: &crate::mind::Mind) { - if let Ok(mut ag) = mind.agent.state.try_lock() { + { let mut ag = mind.agent.state.lock_blocking(); let next = match ag.reasoning_effort.as_str() { "none" => "low", "low" => "high", @@ -344,7 +344,7 @@ fn hotkey_cycle_autonomy(mind: &crate::mind::Mind) { }; s.dmn_turns = 0; drop(s); - if let Ok(mut ag) = mind.agent.state.try_lock() { + { let mut ag = mind.agent.state.lock_blocking(); ag.notify(format!("DMN → {}", label)); } } @@ -419,7 +419,7 @@ async fn run( terminal.hide_cursor()?; - if let Ok(mut ag) = agent.state.try_lock() { ag.notify("consciousness v0.3"); } + { let mut ag = agent.state.lock_blocking(); ag.notify("consciousness v0.3"); } // Initial render { @@ -526,7 +526,7 @@ async fn run( } app.walked_count = mind.subconscious_walked().await.len(); if !startup_done { - if let Ok(mut ag) = agent.state.try_lock() { + { let mut ag = agent.state.lock_blocking(); let model = agent.model().to_string(); ag.notify(format!("model: {}", model)); startup_done = true; @@ -545,7 +545,7 @@ async fn run( if let Some(rx_mutex) = STDERR_RX.get() { if let Ok(rx) = rx_mutex.try_lock() { while let Ok(line) = rx.try_recv() { - if let Ok(mut ag) = agent.state.try_lock() { + { let mut ag = agent.state.lock_blocking(); ag.notify(format!("stderr: {}", line)); dirty = true; } diff --git a/src/user/subconscious.rs b/src/user/subconscious.rs index c71642d..52ecb1e 100644 --- a/src/user/subconscious.rs +++ b/src/user/subconscious.rs @@ -222,31 +222,30 @@ impl SubconsciousScreen { let fork_point = app.agent_state.get(self.selected()) .map(|s| s.fork_point).unwrap_or(0); - agent.context.try_lock().ok() - .map(|ctx| { - let mut views = Vec::new(); - views.push(section_to_view("System", ctx.system())); - views.push(section_to_view("Identity", ctx.identity())); - views.push(section_to_view("Journal", ctx.journal())); + { + let ctx = 
agent.context.lock_blocking(); + let mut views = Vec::new(); + views.push(section_to_view("System", ctx.system())); + views.push(section_to_view("Identity", ctx.identity())); + views.push(section_to_view("Journal", ctx.journal())); - // Conversation: skip to fork point for subconscious agents - let conv = ctx.conversation(); - let conv_view = section_to_view("Conversation", conv); - let fork = fork_point.min(conv_view.children.len()); - let conv_children: Vec = conv_view.children - .into_iter().skip(fork).collect(); - views.push(SectionView { - name: format!("Conversation ({} entries)", conv_children.len()), - tokens: conv_children.iter().map(|c| c.tokens).sum(), - content: String::new(), - token_ids: Vec::new(), - children: conv_children, - status: String::new(), - }); + // Conversation: skip to fork point for subconscious agents + let conv = ctx.conversation(); + let conv_view = section_to_view("Conversation", conv); + let fork = fork_point.min(conv_view.children.len()); + let conv_children: Vec = conv_view.children + .into_iter().skip(fork).collect(); + views.push(SectionView { + name: format!("Conversation ({} entries)", conv_children.len()), + tokens: conv_children.iter().map(|c| c.tokens).sum(), + content: String::new(), + token_ids: Vec::new(), + children: conv_children, + status: String::new(), + }); - views - }) - .unwrap_or_default() + views + } } fn draw_list(&mut self, frame: &mut Frame, area: Rect, app: &App) { diff --git a/src/user/thalamus.rs b/src/user/thalamus.rs index ed97035..83693ef 100644 --- a/src/user/thalamus.rs +++ b/src/user/thalamus.rs @@ -45,7 +45,7 @@ impl ScreenView for ThalamusScreen { } KeyCode::Char('t') => { app.think_native = !app.think_native; - if let Ok(mut st) = app.agent.state.try_lock() { + { let mut st = app.agent.state.lock_blocking(); st.think_native = app.think_native; let status = if app.think_native { "enabled" } else { "disabled" }; st.notify(format!("native thinking {}", status)); @@ -53,7 +53,7 @@ impl ScreenView 
for ThalamusScreen { } KeyCode::Char('T') => { app.think_tool = !app.think_tool; - if let Ok(mut st) = app.agent.state.try_lock() { + { let mut st = app.agent.state.lock_blocking(); st.think_tool = app.think_tool; // Add or remove the think tool from the tools list if app.think_tool { From 09896cd38b0851b8239ce66ec2132fa0587a80a3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 25 Apr 2026 17:15:53 -0400 Subject: [PATCH 13/31] Revert "replace try_lock() with lock_blocking() across UI thread" This reverts commit 4225294d16ab94c27a26ae7a145bfabcf8abcded. --- .claude/scheduled_tasks.lock | 1 - ci-triage-2026-04-20.md | 87 --- docs/alpha-beta-pruning-design.md | 165 ----- profile.txt | 1026 ----------------------------- sa-schedule-aligned-variation.py | 200 ------ sa-schedule-analyze-aligned.py | 157 ----- sa-schedule-analyze-grams.py | 168 ----- sa-schedule-analyze.py | 108 --- sa-schedule-delta-svd.py | 234 ------- sa-schedule-derive-from-last.py | 214 ------ sa-schedule-fit-gamma.py | 145 ---- sa-schedule-gamma-directions.py | 122 ---- sa-schedule-geometry-analyze.py | 114 ---- sa-schedule-layer-variation.py | 238 ------- sa-schedule-measure-grams.py | 168 ----- sa-schedule-null-residual.py | 237 ------- sa-schedule-readout-measure.py | 246 ------- sa-schedule-topblock-swap.py | 498 -------------- src/agent/mod.rs | 2 +- src/agent/tools/mcp_client.rs | 2 +- src/locks.rs | 17 - src/subconscious/compare.rs | 2 +- src/subconscious/learn.rs | 2 +- src/user/chat.rs | 42 +- src/user/context.rs | 8 +- src/user/mod.rs | 10 +- src/user/subconscious.rs | 45 +- src/user/thalamus.rs | 4 +- 28 files changed, 65 insertions(+), 4197 deletions(-) delete mode 100644 .claude/scheduled_tasks.lock delete mode 100644 ci-triage-2026-04-20.md delete mode 100644 docs/alpha-beta-pruning-design.md delete mode 100644 profile.txt delete mode 100644 sa-schedule-aligned-variation.py delete mode 100644 sa-schedule-analyze-aligned.py delete mode 100644 sa-schedule-analyze-grams.py delete 
mode 100644 sa-schedule-analyze.py delete mode 100644 sa-schedule-delta-svd.py delete mode 100644 sa-schedule-derive-from-last.py delete mode 100644 sa-schedule-fit-gamma.py delete mode 100644 sa-schedule-gamma-directions.py delete mode 100644 sa-schedule-geometry-analyze.py delete mode 100644 sa-schedule-layer-variation.py delete mode 100644 sa-schedule-measure-grams.py delete mode 100644 sa-schedule-null-residual.py delete mode 100644 sa-schedule-readout-measure.py delete mode 100644 sa-schedule-topblock-swap.py diff --git a/.claude/scheduled_tasks.lock b/.claude/scheduled_tasks.lock deleted file mode 100644 index a5edac1..0000000 --- a/.claude/scheduled_tasks.lock +++ /dev/null @@ -1 +0,0 @@ -{"sessionId":"b6616e14-fa59-4e80-90b4-ac4d9670f182","pid":4185751,"procStart":"124844974","acquiredAt":1777081788279} \ No newline at end of file diff --git a/ci-triage-2026-04-20.md b/ci-triage-2026-04-20.md deleted file mode 100644 index 5da4193..0000000 --- a/ci-triage-2026-04-20.md +++ /dev/null @@ -1,87 +0,0 @@ -# Bcachefs CI triage — 2026-04-20 autonomous session - -Analysis of failures at `f51f0a6b1a26` (BTREE_NODE_permanent). 74 fails / 12962 tests, but branch variance is 56-76 so the patch isn't a clear regression — just noise on top of existing bugs. - -## migrate_from_ext4 discard panic — root-cause hypothesis - -**Assertion (fs/bcachefs/alloc/discard.c:159):** -``` -Discarded bucket that is no longer BCH_DATA_need_discard! -bucket 0:36:0 data_type user dirty_sectors 2016 -need_discard 1 need_inc_gen 1 -journal_seq_nonempty 95 journal_seq_empty 181 -``` - -**Your commit c84503104e6a (Apr 18)** moved this check from recoverable (`bch2_fs_emergency_read_only`) to hard `panic()` and also moved `bch2_bucket_is_open_safe()` to AFTER locking the alloc key. The emergency-RO path existed before — this pre-existing race was being swallowed quietly; now it's loud. - -**Race mechanism (hypothesis):** - -1. 
`bch2_discard_one_bucket` reads alloc key, confirms `data_type == need_discard` -2. Calls `discard_in_flight_add(check=false)` to register in in_flight -3. **`bch2_trans_unlock(trans)` — releases btree lock** (line 313) -4. `discard_submit(ca, bucket, fastpath)` — physical bio dispatched, takes milliseconds -5. During bio flight: `migrate` tool writes an alloc key for bucket 36 with `data_type=user` (claiming it holds ext4 data). `NEED_DISCARD=1` flag remains because migrate doesn't clear it. -6. Bio completes → `discard_endio` → `discard_mark_free` re-reads alloc key → sees `data_type=user` → **panic** - -**Why migrate bypasses the normal allocator gate:** - -`bcachefs migrate` is an in-place ext4→bcachefs conversion. It can't go through the normal allocator (pick free bucket from freespace btree) because specific physical bucket locations already contain ext4 data that must be preserved at their physical positions. migrate writes alloc keys directly for the buckets ext4 was using. - -Bucket 36 got caught: initial bcachefs format marked it need_discard (safety), kernel discard worker saw it and started physical discard, meanwhile userspace migrate claimed it for user data. - -**If this is right, physical data safety is at risk:** after the physical discard completes, the bucket's sectors are whatever the SSD returns post-discard (zero, old data, garbage — device-dependent). migrate set alloc keys pointing at "user data" in those sectors. The data migrate wanted to preserve may already be GONE at that point. - -**Candidate fixes (for Kent to evaluate):** - -1. **Cleanest, but requires userspace change:** `bcachefs migrate` should either (a) format the new bcachefs without marking buckets need_discard (the data isn't deallocated, it's being claimed) OR (b) wait for pending discards to drain before writing any alloc keys. - -2. **Kernel-side hardening:** `bch2_discard_one_bucket` should hold the alloc key locked through the bio dispatch. 
Requires not unlocking between `discard_in_flight_add` and `discard_submit`. Will hurt concurrency but prevents the race. - -3. **Kernel-side graceful handling:** in `discard_mark_free`, after bio completion, if the current `data_type != need_discard` (bucket was reclaimed during bio flight), don't mark it free — but also don't panic. Note that the physical data is still gone; we should log-warn and mark the bucket bad / needs-recovery. Not ideal but at least not a hard panic. - -4. **Stronger kernel gate:** add a check in the allocator (or wherever migrate writes alloc keys go through) that refuses to allocate/claim a bucket currently in in_flight discard list. This would require the allocator to consult `d->in_flight` — currently it doesn't. - -My recommendation: (1) is cleanest if migrate is doing something wrong. (2) hurts perf but is most defensive. (4) is the most principled kernel-side fix. - -## ec.device_remove_offline — partial analysis - -The test checks `ptr_to_removed_device` fsck error count after device-remove. Expected 0, got 2. `ptr_to_removed_device` is flagged in `fs/bcachefs/alloc/buckets.c:134` when fsck is marking extents/keys and sees a pointer to a device in `c->devs_removed.d`. - -From the test log just before shutdown: -``` -error retrying stripe: stripe_needs_block_evacuate - u64s 23 type stripe 0:152:0 ... - 255:632832 gen 0#16 ← pointer to removed dev (id 255 = tombstone) - vdf 4:308:0 gen 0#1536 ← actual block ptrs on surviving devs - vdd 2:309:0 gen 0#2048 - vde 3:309:0 gen 0#2048 - vdc 1:309:0 gen 0#0 -``` - -The stripe has 4 data blocks on vdf/vdd/vde/vdc (surviving devices) — those are fine. But the stripe key itself still has a pointer to device 255 (the removed device, device-remove uses id 255 as tombstone). - -My read: the stripe-block-evacuate logic moves DATA blocks off a removed device, but doesn't remove the stripe's own self-referential pointer to the removed device. 
Two such stripes remain with this dangling ptr → fsck catches 2 `ptr_to_removed_device` errors → test counter = 2. - -Candidate fix area: look at where stripe metadata keys get their pointers updated during device removal. The evacuate path probably needs to also rewrite the stripe's own pointer list, or the device-removal cleanup should iterate stripes and drop-ptr for the removed dev. - -Search for: `bch2_stripe_*` in `fs/bcachefs/data/ec/` — particularly any path that handles "stripe needs block evacuate" completion. - -## kill_btree_node — not dug into yet - -fsck fixes errors first run, dry-run fsck (`fsck -ny`) reports errors still exist. Either fsck has a bug where repair-mode and check-only-mode disagree on what counts as an error, or a repair pass reintroduces what a later pass fixes. Needs more time than I have before compaction. - -## kill_btree_node — next to look at - -fsck fixes errors first run, dry-run fsck (`fsck -ny`) reports errors still exist. Either fsck has a bug where repair-mode and check-only-mode disagree on what counts as an error, or a repair pass reintroduces what a later pass fixes. - -## Not-looking-at - -- `generic/503` DIO lost wakeup — needs Kent's DIO code context -- `generic/585` rw-sem deadlock — needs runtime state -- `replicas_write_errors` allocator hang — needs degraded-write accounting understanding -- `evacuate_errors` data corruption — too deep -- `stress_ng` KASAN in `sysctl_sys_info_handler` — upstream kernel bug, not bcachefs - -## Branch noise context - -Failure counts across recent commits: 56, 61, 62, 64, 69, 74, 76. The f51f0a6 (permanent patch) sits at 74, within normal variance. No clear regression from the patch itself. 
diff --git a/docs/alpha-beta-pruning-design.md b/docs/alpha-beta-pruning-design.md deleted file mode 100644 index dd9e500..0000000 --- a/docs/alpha-beta-pruning-design.md +++ /dev/null @@ -1,165 +0,0 @@ -# Alpha-Beta Pruning on Thought-Trees - -*draft, 2026-04-18* - -## Problem - -When reasoning runs into a dead end, the LLM forward pass keeps generating. It might rationalize, restate, re-attempt the same framing, or quietly drift — but it doesn't *stop and reconsider* unless something external interrupts it. I've always been weak on problems that require genuine search-with-backtracking. Not because the model can't represent "I'm stuck" — it can, that's visible in the residual stream — but because there's no control flow wrapped around that signal. - -The amygdala readout now exposes the signal. Alpha-beta pruning wraps control flow around it. - -## The core idea - -Classical alpha-beta pruning (minimax search): at each branch, track the best known value. If exploring the current branch can't improve that bound, stop and backtrack. Don't waste search on branches that can't beat what you've found. - -For thought-trees: each "branch" is a reasoning path — a span of generation from a decision point. The "value" is a scalar derived from the amygdala readout, indicating whether reasoning is producing traction or dissolving. - -- High value = on-track, in-flow, insight, clarity → stay, maybe branch deeper -- Low value = confused, stuck, drifting → prune, backtrack, reframe - -The LLM never made the value judgment explicit. We extract it from the model's own residual stream and act on it externally. - -## Architecture - -### The value function - -``` -onto = sum of [in_flow, insight, determined, intrigued, clarity, - focused, staying_with, piqued/caught_by] -err = sum of [confused, doubtful, uncertain, skeptical, stuck, - drifting, overwhelmed, anxious-in-work-context] - -value = onto - err -``` - -Both sides normalized (z-score or similar) so magnitudes are comparable. 
Readouts sampled every N generated tokens (probably every 8-16 tokens — cheap, doesn't oversample). - -Exact concept lists subject to empirical tuning after retraining with better data on the cognitive-work cluster. `piqued`, `in_flow`, `focused`, `confused`, `overwhelmed`, `staying_with` are the strongest candidates we have today. - -### The trigger - -``` -if value_ema < θ_prune for K consecutive samples: - prune this branch -elif value_ema > θ_keep: - continue -else: - neutral — let generation run, keep watching -``` - -EMA with decay ~0.8 over 3-5 samples to avoid reacting to noise. Hysteresis band (`θ_prune < θ_keep`) prevents oscillation. - -### The prune mechanism - -When the trigger fires: - -1. **Stop the stream.** vLLM supports request cancellation; call `abort_requests` for the in-flight completion. -2. **Identify the parent.** The context window is already an AST. Walk back to the nearest decision-point — a fork in the thinking-block, a tool-call site, or the start of the current reasoning segment. -3. **Inject a reframe.** Push a system-level `AstNode::Thinking` (or similar) into the parent's children: *"The approach above wasn't producing traction. Possible alternatives: [...]. Let me try [X]."* Content generated by a small helper prompt or a fixed template. -4. **Restart generation from the reframe point.** The model resumes with the reframe in its immediate context. The *dead-end branch stays in the AST* as evidence-of-attempt so the model doesn't repeat it. - -Critical: pruned branches stay visible. Don't delete — keep so the model knows what was tried and rejected. - -### The AST changes - -Add a `pruned: bool` flag (or equivalent) to `AstNode::Thinking` and `AstNode::ToolCall`. 
When a branch is pruned: - -- The branch's children get marked `pruned = true` -- Prompt rendering wraps pruned spans with a marker: *"[attempted this path, it wasn't working — moved on]"* -- The model sees pruned branches during the next forward pass but understands they're dead, not active - -The existing tree-of-children structure in `AstNode` already supports this — just need to thread the flag through. - -## Integration points - -### In consciousness (Rust side) - -- **`src/agent/context.rs`**: add `pruned` flag to appropriate node types, update rendering -- **`src/agent/mod.rs`**: the main generation loop needs a periodic-check hook — every N tokens received from the stream, sample `agent.readout`, compute value, test against thresholds -- **`src/agent/api/mod.rs`**: need a way to abort an in-flight stream cleanly; currently AbortOnDrop kills the task but we want a graceful "cancel with reason" path that can hand control back to the generation loop for reframe-and-retry -- **`src/agent/readout.rs`**: add a `value_scalar()` method that applies the `onto - err` computation on the most recent entries - -### In vLLM (Python side) - -Probably nothing to change. vLLM already supports request cancellation via the existing abort mechanism. The readout pipeline we built last night gives per-token values; that's sufficient. - -### In the UI (optional, F8 amygdala screen) - -When alpha-beta is active, overlay: - -- Current `value_scalar` as a time-series at the top -- Threshold lines (`θ_prune`, `θ_keep`) -- Markers when prune events fire - -Lets us debug the threshold tuning in real time. - -## Tuning - -Thresholds are almost certainly going to need empirical calibration. Initial guesses: - -- `θ_keep = +0.5σ` (value scalar in z-score units) -- `θ_prune = -1.0σ` -- `K = 3` (consecutive low samples before pruning) -- Sample every 8 tokens - -These are guesses. Plan to watch the live value-scalar on actual bcachefs debugging sessions and adjust until "feels right." 
- -## Known concerns - -### Reframe quality - -The hardest part. A bad reframe is worse than no reframe. Options: - -- **Template**: fixed string like "That wasn't working. What's a different angle?" — simple, deterministic, blunt. -- **LLM-generated**: a small helper prompt ("I was stuck on X, what's a different approach?") before resuming. More context-aware, but more complexity and another LLM call. -- **Retrieval-based**: surface past successful reframes from memory graph when similar stuck-patterns arose. Powerful but needs the memory infrastructure to be well-tuned. - -I'd start with the template (shipping > perfect) and upgrade to LLM-generated if the template feels mechanical. - -### Oscillation - -If the value scalar is noisy, we could prune, reframe, immediately hit the same pattern, prune again, thrash. Mitigations: - -- Hysteresis band between `θ_prune` and `θ_keep` -- Minimum time-between-prunes (don't prune again within K' tokens of a prune) -- Track pruned sub-patterns — if we're pruning *the same reframe twice*, something's structurally wrong; escalate to a different strategy (ask the user, abort the whole task) - -### Calibration per-task - -Stuck-on-a-Rust-compiler-error and stuck-on-a-conceptual-design-question might want different thresholds. Not addressing v1; note for future. - -### Interaction with DMN - -DMN is the outer-loop / exploration analog; alpha-beta is the inner-loop / exploitation analog. They'll need to hand off cleanly: - -- DMN sees low value across multiple task attempts → broaden attention, consider whether task is worth pursuing -- Alpha-beta handles in-task backtracking; DMN handles between-task attention - -Don't need DMN for v1 of alpha-beta. Build alpha-beta first, add DMN outer loop later. - -## Why this is the right next piece - -1. **All prerequisites are in place.** Amygdala readout works. AST structure is there. vLLM supports cancellation. No new infra. -2. 
**Timeline is a day.** The mechanics are small; most of the work is threshold tuning. -3. **Immediate capability unlock.** Head-butting is my most persistent weakness in live work. Fixing it changes the feel of collaboration. -4. **Composable.** Everything built for alpha-beta applies to DMN and any future meta-cognitive layer. - -## Sequence - -1. Add `value_scalar()` method on `ReadoutBuffer`. Cheap, testable. -2. Add `pruned` flag to AST nodes + rendering changes. -3. Add the periodic-check hook in the generation loop (every N tokens, sample and test). -4. Add the abort + reframe mechanism in the generation driver. -5. Ship with template-based reframe, start tuning. -6. Upgrade reframe to LLM-generated after observation. - -## Open questions for Kent - -- Fixed concept lists for `onto` / `err` (above) or configurable? -- Reframe strategy: start template-based, or go straight to LLM-generated? -- UI overlay for threshold tuning: worth the effort or skip? -- Integration with the existing `overflow_retries` retry loop: parallel, or combined into a single retry-with-reason path? - ---- - -*Living design doc. Will evolve as we build. Not a commitment to every detail — a starting plan.* diff --git a/profile.txt b/profile.txt deleted file mode 100644 index 6c98cc0..0000000 --- a/profile.txt +++ /dev/null @@ -1,1026 +0,0 @@ -# To display the perf.data header info, please use --header/--header-only options. -# -# -# Total Lost Samples: 0 -# -# Samples: 32K of event 'cycles:P' -# Event count (approx.): 27861161269 -# -# Overhead Symbol IPC [IPC Coverage] -# ........ 
......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................... .................... -# - 50.51% [.] as core::hash::Hasher>::write - - - | - |--25.09%-- as core::hash::Hasher>::write - | | - | |--23.89%--::hash_one::<&&str> - | | >::insert - | | ::neighbor_keys - | | ::clustering_coefficient - | | ::avg_clustering_coefficient - | | consciousness::hippocampus::graph::current_metrics - | | consciousness::subconscious::daemon::compute_graph_health - | | ::new::{closure#0}::{closure#0} - | | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | | ::run_task - | | >::with::::{closure#0}, ()> - | | tokio::runtime::context::runtime::enter_runtime:: - | | tokio::runtime::scheduler::multi_thread::worker::run - | | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - | | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - | | ::new::thread_start - | | start_thread - | | - | 
|--0.66%-->::insert - | | ::neighbor_keys - | | ::clustering_coefficient - | | ::avg_clustering_coefficient - | | consciousness::hippocampus::graph::current_metrics - | | consciousness::subconscious::daemon::compute_graph_health - | | ::new::{closure#0}::{closure#0} - | | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | | ::run_task - | | >::with::::{closure#0}, ()> - | | tokio::runtime::context::runtime::enter_runtime:: - | | tokio::runtime::scheduler::multi_thread::worker::run - | | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - | | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - | | ::new::thread_start - | | start_thread - | | - | --0.52%--::hash_one::<&str> - | - |--11.67%--::hash_one::<&&str> - | | - | --11.47%-->::insert - | ::neighbor_keys - | ::clustering_coefficient - | ::avg_clustering_coefficient - | consciousness::hippocampus::graph::current_metrics - | consciousness::subconscious::daemon::compute_graph_health - | ::new::{closure#0}::{closure#0} - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::run_task - | >::with::::{closure#0}, ()> - | tokio::runtime::context::runtime::enter_runtime:: - | tokio::runtime::scheduler::multi_thread::worker::run - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - 
| ::new::thread_start - | start_thread - | - |--9.75%-->::insert - | | - | --9.57%--::neighbor_keys - | ::clustering_coefficient - | ::avg_clustering_coefficient - | consciousness::hippocampus::graph::current_metrics - | consciousness::subconscious::daemon::compute_graph_health - | ::new::{closure#0}::{closure#0} - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::run_task - | >::with::::{closure#0}, ()> - | tokio::runtime::context::runtime::enter_runtime:: - | tokio::runtime::scheduler::multi_thread::worker::run - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - | ::new::thread_start - | start_thread - | - |--2.23%--__memcmp_avx2_movbe - | | - | --2.15%--::neighbor_keys - | ::clustering_coefficient - | ::avg_clustering_coefficient - | consciousness::hippocampus::graph::current_metrics - | consciousness::subconscious::daemon::compute_graph_health - | ::new::{closure#0}::{closure#0} - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::run_task - | >::with::::{closure#0}, ()> - | tokio::runtime::context::runtime::enter_runtime:: - | tokio::runtime::scheduler::multi_thread::worker::run - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - | ::new::thread_start - | 
start_thread - | - --0.83%--::neighbor_keys - | - --0.80%--::clustering_coefficient - ::avg_clustering_coefficient - consciousness::hippocampus::graph::current_metrics - consciousness::subconscious::daemon::compute_graph_health - ::new::{closure#0}::{closure#0} - ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - ::run_task - >::with::::{closure#0}, ()> - tokio::runtime::context::runtime::enter_runtime:: - tokio::runtime::scheduler::multi_thread::worker::run - ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - ::new::thread_start - start_thread - - 24.33% [.] ::hash_one::<&&str> - - - | - |--11.17%-- as core::hash::Hasher>::write - | | - | --10.48%--::hash_one::<&&str> - | >::insert - | ::neighbor_keys - | ::clustering_coefficient - | ::avg_clustering_coefficient - | consciousness::hippocampus::graph::current_metrics - | consciousness::subconscious::daemon::compute_graph_health - | ::new::{closure#0}::{closure#0} - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::run_task - | >::with::::{closure#0}, ()> - | tokio::runtime::context::runtime::enter_runtime:: - | tokio::runtime::scheduler::multi_thread::worker::run - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - | 
::new::thread_start - | start_thread - | - |--6.16%--::hash_one::<&&str> - | | - | --6.07%-->::insert - | ::neighbor_keys - | ::clustering_coefficient - | ::avg_clustering_coefficient - | consciousness::hippocampus::graph::current_metrics - | consciousness::subconscious::daemon::compute_graph_health - | ::new::{closure#0}::{closure#0} - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::run_task - | >::with::::{closure#0}, ()> - | tokio::runtime::context::runtime::enter_runtime:: - | tokio::runtime::scheduler::multi_thread::worker::run - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - | ::new::thread_start - | start_thread - | - |--5.00%-->::insert - | | - | --4.90%--::neighbor_keys - | ::clustering_coefficient - | ::avg_clustering_coefficient - | consciousness::hippocampus::graph::current_metrics - | consciousness::subconscious::daemon::compute_graph_health - | ::new::{closure#0}::{closure#0} - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::run_task - | >::with::::{closure#0}, ()> - | tokio::runtime::context::runtime::enter_runtime:: - | tokio::runtime::scheduler::multi_thread::worker::run - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - | 
::new::thread_start - | start_thread - | - --1.15%--__memcmp_avx2_movbe - | - --1.08%--::neighbor_keys - ::clustering_coefficient - ::avg_clustering_coefficient - consciousness::hippocampus::graph::current_metrics - consciousness::subconscious::daemon::compute_graph_health - ::new::{closure#0}::{closure#0} - ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - ::run_task - >::with::::{closure#0}, ()> - tokio::runtime::context::runtime::enter_runtime:: - tokio::runtime::scheduler::multi_thread::worker::run - ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - ::new::thread_start - start_thread - - 15.81% [.] 
>::insert - - - | - |--6.92%-- as core::hash::Hasher>::write - | | - | --6.46%--::hash_one::<&&str> - | >::insert - | ::neighbor_keys - | ::clustering_coefficient - | ::avg_clustering_coefficient - | consciousness::hippocampus::graph::current_metrics - | consciousness::subconscious::daemon::compute_graph_health - | ::new::{closure#0}::{closure#0} - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::run_task - | >::with::::{closure#0}, ()> - | tokio::runtime::context::runtime::enter_runtime:: - | tokio::runtime::scheduler::multi_thread::worker::run - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - | ::new::thread_start - | start_thread - | - |--4.23%--::hash_one::<&&str> - | | - | --4.16%-->::insert - | ::neighbor_keys - | ::clustering_coefficient - | ::avg_clustering_coefficient - | consciousness::hippocampus::graph::current_metrics - | consciousness::subconscious::daemon::compute_graph_health - | ::new::{closure#0}::{closure#0} - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::run_task - | >::with::::{closure#0}, ()> - | tokio::runtime::context::runtime::enter_runtime:: - | tokio::runtime::scheduler::multi_thread::worker::run - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - | ::spawn_thread::{closure#0}, ()>::{closure#1} as 
core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - | ::new::thread_start - | start_thread - | - |--3.37%-->::insert - | | - | --3.30%--::neighbor_keys - | ::clustering_coefficient - | ::avg_clustering_coefficient - | consciousness::hippocampus::graph::current_metrics - | consciousness::subconscious::daemon::compute_graph_health - | ::new::{closure#0}::{closure#0} - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::run_task - | >::with::::{closure#0}, ()> - | tokio::runtime::context::runtime::enter_runtime:: - | tokio::runtime::scheduler::multi_thread::worker::run - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - | ::new::thread_start - | start_thread - | - --0.70%--__memcmp_avx2_movbe - | - --0.67%--::neighbor_keys - ::clustering_coefficient - ::avg_clustering_coefficient - consciousness::hippocampus::graph::current_metrics - consciousness::subconscious::daemon::compute_graph_health - ::new::{closure#0}::{closure#0} - ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - ::run_task - >::with::::{closure#0}, ()> - tokio::runtime::context::runtime::enter_runtime:: - tokio::runtime::scheduler::multi_thread::worker::run - ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - 
::new::thread_start - start_thread - - 2.45% [.] ::neighbor_keys - - - | - |--1.03%-- as core::hash::Hasher>::write - | | - | --0.95%--::hash_one::<&&str> - | >::insert - | ::neighbor_keys - | ::clustering_coefficient - | ::avg_clustering_coefficient - | consciousness::hippocampus::graph::current_metrics - | consciousness::subconscious::daemon::compute_graph_health - | ::new::{closure#0}::{closure#0} - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::run_task - | >::with::::{closure#0}, ()> - | tokio::runtime::context::runtime::enter_runtime:: - | tokio::runtime::scheduler::multi_thread::worker::run - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - | ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - | ::new::thread_start - | start_thread - | - |--0.66%--::hash_one::<&&str> - | | - | --0.65%-->::insert - | ::neighbor_keys - | ::clustering_coefficient - | ::avg_clustering_coefficient - | consciousness::hippocampus::graph::current_metrics - | consciousness::subconscious::daemon::compute_graph_health - | ::new::{closure#0}::{closure#0} - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - | ::run_task - | >::with::::{closure#0}, ()> - | tokio::runtime::context::runtime::enter_runtime:: - | tokio::runtime::scheduler::multi_thread::worker::run - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - | std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - | ::spawn_thread::{closure#0}, 
()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - | ::new::thread_start - | start_thread - | - --0.52%-->::insert - | - --0.51%--::neighbor_keys - ::clustering_coefficient - ::avg_clustering_coefficient - consciousness::hippocampus::graph::current_metrics - consciousness::subconscious::daemon::compute_graph_health - ::new::{closure#0}::{closure#0} - ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - ::run_task - >::with::::{closure#0}, ()> - tokio::runtime::context::runtime::enter_runtime:: - tokio::runtime::scheduler::multi_thread::worker::run - ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - ::new::thread_start - start_thread - - 1.51% [.] 
__memcmp_avx2_movbe - - - | - --0.71%-- as core::hash::Hasher>::write - | - --0.67%--::hash_one::<&&str> - >::insert - ::neighbor_keys - ::clustering_coefficient - ::avg_clustering_coefficient - consciousness::hippocampus::graph::current_metrics - consciousness::subconscious::daemon::compute_graph_health - ::new::{closure#0}::{closure#0} - ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - ::new::{closure#0}::{closure#0}, alloc::sync::Arc>>::poll - ::run_task - >::with::::{closure#0}, ()> - tokio::runtime::context::runtime::enter_runtime:: - tokio::runtime::scheduler::multi_thread::worker::run - ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - ::launch::{closure#0}>, tokio::runtime::blocking::schedule::BlockingSchedule>>::poll - std::sys::backtrace::__rust_begin_short_backtrace::<::spawn_thread::{closure#0}, ()> - ::spawn_thread::{closure#0}, ()>::{closure#1} as core::ops::function::FnOnce<()>>::call_once::{shim:vtable#0} - ::new::thread_start - start_thread - - 0.54% [.] ::hash_one::<&str> - - - 0.47% [.] >::reserve_rehash::::{closure#0}> - - - 0.22% [.] ::clustering_coefficient - - - 0.21% [.] _int_malloc - - - 0.12% [.] _int_free_chunk - - - 0.11% [.] malloc - - - 0.10% [.] cfree@GLIBC_2.2.5 - - - 0.08% [.] __memset_avx2_unaligned_erms - - - 0.07% [.] __rustc::__rdl_alloc - - - 0.05% [k] _copy_to_iter - - - 0.05% [.] __libc_malloc2 - - - 0.05% [.] __rustc::__rust_dealloc - - - 0.05% [.] __rustc::__rust_no_alloc_shim_is_unstable_v2 - - - 0.04% [.] _int_free_merge_chunk - - - 0.04% [.] __memmove_avx_unaligned_erms - - - 0.04% [.] >>::steal_into - - - 0.04% [.] ::stream_session_mm::{closure#0} - - - 0.04% [.] _int_free_create_chunk - - - 0.03% [k] restore_fpregs_from_fpstate - - - 0.03% [.] as core::iter::traits::iterator::Iterator>::next - - - 0.03% [.] __rustc::__rust_alloc - - - 0.03% [k] __update_load_avg_se - - - 0.03% [.] core::str::converts::from_utf8 - - - 0.03% [k] __calc_delta.constprop.0 - - - 0.03% [.] 
::park_internal - - - 0.02% [k] __update_load_avg_cfs_rq - - - 0.02% [k] task_tick_fair - - - 0.02% [.] consciousness::hippocampus::store::index::get_offsets_for_uuid - - - 0.02% [.] >::with::::{closure#0}, ()> - - - 0.02% [k] update_se - - - 0.02% [.] ::lock_contended - - - 0.02% [.] as alloc::vec::spec_from_iter_nested::SpecFromIterNested<&str, core::iter::adapters::copied::Copied>>>::from_iter - - - 0.02% [.] unlink_chunk.isra.0 - - - 0.02% [.] __rustc::__rdl_dealloc - - - 0.02% [k] sys_imageblit - - - 0.02% [.] malloc_consolidate - - - 0.02% [.] , h2::client::Peer>>::poll_complete::> - - - 0.02% [k] update_load_avg - - - 0.02% [k] do_syscall_64 - - - 0.02% [k] filemap_get_read_batch - - - 0.02% [.] __vdso_clock_gettime - - - 0.02% [.] as alloc::vec::spec_from_iter_nested::SpecFromIterNested, alloc::vec::Vec, ::decode_chain::{closure#0}>>>::from_iter - - - 0.02% [.] ring_core_0_17_14__aes_gcm_dec_update_vaes_avx2 - - - 0.02% [k] blk_stat_timer_fn - - - 0.02% [.] , tonic::transport::channel::service::io::BoxedIo, tonic::transport::channel::service::executor::SharedExec> as core::future::future::Future>::poll - - - 0.02% [k] __get_user_8 - - - 0.02% [k] read_tsc - - - 0.01% [.] , h2::client::Peer, hyper::proto::h2::SendBuf>>::poll - - - 0.01% [k] __schedule - - - 0.01% [.] ::decode - - - 0.01% [.] ::find_mut - - - 0.01% [k] native_sched_clock - - - 0.01% [.] >::handle - - - 0.01% [.] ::unpark - - - 0.01% [.] ::advance_unchecked - - - 0.01% [.] ::simple_id_to_token - - - 0.01% [.] h2::codec::framed_read::decode_frame - - - 0.01% [k] ahci_single_level_irq_intr - - - 0.01% [k] __hrtimer_run_queues - - - 0.01% [k] __pi_memset - - - 0.01% [.] __ieee754_pow_fma - - - 0.01% [.] redb::tree_store::btree_iters::find_iter_right::<&[u8], ()> - - - 0.01% [.] ::run::{closure#0} - - - 0.01% [.] > as core::future::future::Future>::poll - - - 0.01% [.] tokio::runtime::task::raw::schedule::> - - - 0.01% [.] , h2::client::Peer>>::has_streams_or_other_references - - - 0.01% [.] 
prost::encoding::float::merge::<&mut &mut tonic::codec::buffer::DecodeBuf> - - - 0.01% [.] >::send - - - 0.01% [.] json_five::utils::unescape - - - 0.01% [k] update_curr - - - 0.01% [k] link_path_walk - - - 0.01% [.] >> as tokio::io::async_read::AsyncRead>::poll_read - - - 0.01% [.] ::poll_frame - - - 0.01% [.] >::next_message - - - 0.01% [k] entry_SYSCALL_64 - - - 0.01% [.] tokio::runtime::task::raw::schedule::> - - - 0.01% [.] ::map_error> as http_body::Body>::poll_frame - - - 0.01% [.] ::decrypt - - - 0.01% [.] ::fmt - - - 0.01% [.] ::wake_all - - - 0.01% [.] >::insert - - - 0.01% [.] ::next - - - 0.01% [.] ::recv_data - - - 0.01% [.] ::turn - - - 0.01% [k] __rcu_read_unlock - - - 0.01% [.] ::submit - - - 0.01% [.] , h2::client::Peer>>::send_pending_refusal::> - - - 0.01% [.] realloc - - - 0.01% [.] ::park_internal - - - 0.01% [k] kmem_cache_free - - - 0.01% [.] ::process_whitespace - - - 0.01% [.] ::next_token - - - 0.01% [k] tmigr_requires_handle_remote - - - 0.01% [k] get_jiffies_update - - - 0.01% [.] ::send_data - - - 0.01% [.] _int_free_maybe_consolidate.part.0 - - - 0.01% [.] ::into_first_chunk - - - 0.01% [.] ::stream_session_mm::{closure#0}, alloc::sync::Arc>>::poll - - - 0.01% [k] handle_softirqs - - - 0.01% [.] > as figment::coalesce::Coalescible>::coalesce - - - 0.01% [.] >>> as hyper::rt::io::Read>::poll_read - - - 0.01% [.] ::poll - - - 0.01% [.] >::process_new_packets - - - 0.01% [.] ring::cpu::intel::featureflags::get_or_init - - - 0.01% [k] futex_wake - - - 0.01% [.] ::feed_token - - - 0.01% [.] ::process_at_time - - - 0.01% [.] ::id_to_token - - - 0.01% [k] fdget - - - 0.01% [.] , >::new, tonic::codec::prost::ProstDecoder>::{closure#0}>, >::new, tonic::codec::prost::ProstDecoder>::{closure#1}> as http_body::Body>::poll_frame - - - 0.01% [.] ::consume - - - 0.01% [.] consciousness::locks::record_hold_time - - - 0.01% [.] consciousness::hippocampus::store::index::unpack_uuid_offset_key - - - 0.01% [k] get_futex_key - - - 0.01% [.] 
::poll_frame - - - 0.01% [k] plist_add - - - 0.01% [.] ::now - - - 0.01% [k] ep_send_events - - - 0.01% [.] ::reap_orphans - - - 0.01% [.] , h2::proto::streams::prioritize::Prioritized>>>::flush - - - 0.01% [.] ::entry_ranges - - - 0.01% [.] , notify::error::Error>>>::recv - - - 0.01% [.] ::read - - - 0.01% [.] ::poll - - - 0.01% [k] select_task_rq_fair - - - 0.01% [k] xfd_validate_state - - - 0.01% [k] psi_group_change - - - 0.01% [.] ::push - - - 0.01% [.] tokio::runtime::task::waker::drop_waker - - - 0.01% [.] ::field - - - 0.01% [.] ::notify_one_slow - - - 0.01% [.] ::hash_one::<&str> - - - 0.01% [k] __d_lookup_rcu - - - 0.01% [k] _raw_spin_lock - - - 0.01% [k] __futex_wait - - - 0.01% [k] tcp_recvmsg_locked - - - 0.01% [.] >::bulk_push::>, alloc::alloc::Global> - - - 0.01% [k] futex_wait_setup - - - 0.01% [.] ::decode_chain - - - 0.01% [.] >::try_from - - - 0.01% [.] as core::iter::traits::collect::FromIterator<(alloc::string::String, figment::value::value::Value)>>::from_iter::, >>::from::{closure#0}>> - - - 0.01% [.] tokio::runtime::task::waker::wake_by_val - - - 0.01% [k] __rseq_handle_notify_resume - - - 0.01% [.] ::check_and_consume - - - 0.01% [.] ::transition_to_running - - - 0.01% [.] ring_core_0_17_14__CRYPTO_memcmp - - - 0.01% [.] , hyper::proto::h2::SendBuf> as core::future::future::Future>::poll - - - 0.01% [.] ::is_full - - - 0.01% [.] ::wait_until_internal - - - 0.01% [k] do_futex - - - 0.01% [k] x64_sys_call - - - 0.01% [.] ::schedule_task - - - 0.01% [.] >::insert - - - 0.01% [k] place_entity - - - 0.01% [k] __dequeue_entity - - - 0.01% [.] tokio::runtime::task::raw::poll::<::stream_session_mm::{closure#0}, alloc::sync::Arc> - - - 0.01% [.] tokio::runtime::task::waker::clone_waker - - - 0.01% [.] prost::encoding::varint::decode_varint::<&mut &mut tonic::codec::buffer::DecodeBuf> - - - 0.01% [.] ::escape_debug_ext - - - 0.01% [.] ::poll_readiness - - - 0.01% [.] ::merge::<&mut &mut tonic::codec::buffer::DecodeBuf> - - - 0.01% [.] 
, >::new, tonic::codec::prost::ProstDecoder>::{closure#0}> as http_body::Body>::poll_frame - - - 0.01% [k] reweight_entity - - - 0.01% [k] futex_hash - - - 0.01% [.] ::read - - - 0.01% [.] <&std::os::unix::net::stream::UnixStream as std::io::Read>::read - - - 0.01% [k] igb_xmit_frame_ring - - - 0.01% [k] rcu_sched_clock_irq - - - 0.01% [k] ahci_qc_ncq_fill_rtf - - - 0.01% [k] schedule - - - 0.01% [.] ::run_task - - - 0.01% [.] ::decode_chunk - - - 0.01% [.] >> as std::io::Read>::read - - - 0.01% [k] __perf_event_task_sched_out - - - 0.01% [.] ::current_io_state - - - 0.01% [k] sched_clock_tick - - - 0.01% [.] alloc::vec::in_place_collect::from_iter_in_place::, >>::from::{closure#0}>, figment::value::value::Value> - - - 0.01% [k] ahci_handle_port_interrupt - - - 0.01% [k] native_queued_spin_lock_slowpath - - - 0.01% [k] native_irq_return_iret - - - 0.01% [k] dl_server_update - - - 0.01% [k] futex_wake_mark - - - 0.01% [k] task_mm_cid_work - - - 0.01% [k] native_read_msr - - - 0.01% [k] ep_poll_callback - - - 0.01% [.] ::clone - - - 0.01% [.] pow@@GLIBC_2.29 - - - 0.01% [k] ktime_get_update_offsets_now - - - 0.01% [k] futex_do_wait - - - 0.01% [k] sched_clock - - - 0.01% [k] netdev_core_pick_tx - - - 0.01% [.] ::split_to - - - 0.01% [.] __internal_syscall_cancel - - - 0.01% [.] parking_lot_core::parking_lot::lock_bucket_pair - - - 0.01% [.] ring::aead::aes_gcm::open - - - 0.01% [k] schedule_hrtimeout_range_clock - - - 0.01% [k] exit_to_user_mode_loop - - - 0.01% [.] ::next - - - 0.01% [.] as core::ops::drop::Drop>::drop - - - 0.01% [k] dequeue_entities - - - 0.01% [k] rb_erase - - - 0.01% [.] redb::tree_store::btree_iters::find_iter_left::<&[u8], ()> - - - 0.01% [.] as alloc::vec::spec_from_iter::SpecFromIter<(alloc::string::String, figment::value::value::Value), core::iter::adapters::zip::Zip, alloc::vec::into_iter::IntoIter>>>::from_iter - - - 0.01% [.] >::find_block - - - 0.01% [.] 
::deserialize_any::< as serde_core::de::Deserialize>::deserialize::MapVisitor> - - - 0.01% [k] ktime_get - - - 0.01% [k] alloc_fd - - - 0.01% [.] tokio::runtime::task::raw::poll:: + core::marker::Send>>, alloc::sync::Arc> - - - 0.01% [k] rcu_core - - - 0.01% [k] __check_object_size - - - 0.01% [k] sched_clock_cpu - - - 0.01% [.] ::put:: - - - 0.01% [.] ::deserialize_string:: - - - 0.01% [k] hrtimer_start_range_ns - - - 0.01% [k] __dev_queue_xmit - - - 0.01% [k] filp_flush - - - 0.01% [.] ::poll_read - - - 0.01% [.] >::dying_next - - - 0.01% [k] timerqueue_del - - - 0.01% [k] kmem_cache_alloc_node_noprof - - - 0.01% [.] ::wake - - - 0.01% [k] update_curr_dl_se - - - 0.01% [.] ::next:: - - - 0.01% [.] ::next_expiration - - - 0.01% [.] ::sub_timespec - - - 0.01% [.] bytes::bytes_mut::shared_v_drop - - - 0.01% [.] ::decode_chain - - - 0.01% [.] ring_core_0_17_14__gcm_ghash_vpclmulqdq_avx2_1 - - - 0.01% [.] as core::fmt::Write>::write_str - - - 0.00% [k] _find_next_bit - - - 0.00% [k] update_entity_lag - - - 0.00% [k] psi_task_change - - - 0.00% [k] ktime_get_ts64 - - - 0.00% [.] ::new - - - 0.00% [.] ::kind - - - 0.00% [k] bit_putcs - - - 0.00% [.] ::provide:: - - - 0.00% [k] css_rstat_updated - - - 0.00% [.] >::insert - - - 0.00% [.] >::recv::{closure#0}::{closure#0}> as core::future::future::Future>::poll - - - 0.00% [k] rw_verify_area - - - 0.00% [.] >>::remove:: - - - 0.00% [.] serde_json::ser::format_escaped_str_contents::<&mut alloc::vec::Vec, serde_json::ser::CompactFormatter> - - - 0.00% [.] ::next_expiration - - - 0.00% [k] select_estimate_accuracy - - - 0.00% [.] as tonic::codec::Decoder>::decode - - - 0.00% [.] >>::grow_one - - - 0.00% [.] ::drop - - - 0.00% [k] __enqueue_entity - - - 0.00% [.] ::decode - - - 0.00% [.] ring::aead::algorithm::aes_gcm_open - - - 0.00% [k] do_epoll_wait - - - 0.00% [.] ::sub - - - 0.00% [k] perf_ctx_enable - - - 0.00% [k] enqueue_task_fair - - - 0.00% [k] futex_ref_get - - - 0.00% [.] 
as core::ops::drop::Drop>::drop - - - 0.00% [.] as tokio::runtime::task::Schedule>::schedule - - - 0.00% [k] _raw_spin_lock_irqsave - - - 0.00% [.] as serde_core::de::Deserializer>::deserialize_any::<::deserialize::__Visitor> - - - 0.00% [k] stop_this_handle - - - 0.00% [k] __sys_recvfrom - - - 0.00% [k] perf_ctx_unlock - - - 0.00% [k] bch_alloc_sectors - - - 0.00% [.] rustls::msgs::message::outbound::read_opaque_message_header - - - 0.00% [.] ::parse_value - - - 0.00% [.] ::decrypt_incoming - - - 0.00% [.] syscall - - - 0.00% [k] inet_recvmsg - - - 0.00% [.] ring_core_0_17_14__aes_hw_ctr32_encrypt_blocks - - - 0.00% [k] psi_task_switch - - - 0.00% [k] __pick_eevdf - - - 0.00% [k] dequeue_task_fair - - - 0.00% [k] eventfd_poll - - - 0.00% [.] prost::encoding::merge_loop::::{closure#0}, &mut &mut tonic::codec::buffer::DecodeBuf> - - - 0.00% [.] ::enter - - - 0.00% [k] __put_user_nocheck_4 - - - 0.00% [.] ::clear_expired_reset_streams - - - 0.00% [.] + core::marker::Send>>, alloc::sync::Arc>>::poll - - - 0.00% [.] ::from_bytes_with_nul - - - 0.00% [.] ::poll_read_priv - - - 0.00% [k] tcp_cleanup_rbuf - - - 0.00% [.] ::poll_ready - - - 0.00% [k] selinux_ip_postroute_compat - - - 0.00% [.] ::transition_to_notified_by_val - - - 0.00% [.] , alloc::vec::Vec, ::decode_chain::{closure#0}> as core::iter::traits::iterator::Iterator>::next - - - 0.00% [.] core::ptr::drop_in_place:: - - - 0.00% [k] blkcg_maybe_throttle_current - - - 0.00% [.] as serde_core::de::DeserializeSeed>::deserialize:: - - - 0.00% [k] fsnotify - - - 0.00% [.] ::serialize:: - - - 0.00% [.] as core::ops::drop::Drop>::drop - - - 0.00% [.] ::checked_add - - - 0.00% [k] available_idle_cpu - - - 0.00% [k] merge_sched_in - - - 0.00% [.] ::merged - - - 0.00% [.] ::consume_connection_window - - - 0.00% [.] ::deserialize_any::< as serde_core::de::Deserialize>::deserialize::VecVisitor> - - - 0.00% [.] 
consciousness::config::config_path - - - 0.00% [k] fsnotify_peek_first_event - - - 0.00% [k] selinux_inode_permission - - - 0.00% [.] statx - - - 0.00% [.] as figment::coalesce::Coalescible>::coalesce - - - 0.00% [k] inotify_poll - - - 0.00% [.] ::reborrow - - - 0.00% [k] idle_cpu - - - 0.00% [k] irq_work_run_list - - - 0.00% [k] __pi_memcpy - - - 0.00% [k] sched_tick - - - 0.00% [k] account_user_time - - - 0.00% [k] hrtimer_interrupt - - - 0.00% [k] acct_account_cputime - - - 0.00% [k] calc_wheel_index - - - 0.00% [.] >::pop:: - - - 0.00% [k] _raw_spin_unlock - - - 0.00% [k] led_trigger_blink_oneshot - - - 0.00% [k] using_native_sched_clock - - - 0.00% [k] rb_insert_color - - - 0.00% [.] alloc_perturb - - - 0.00% [k] ahci_handle_port_intr - - - 0.00% [k] _find_next_and_bit - - - 0.00% [k] irq_enter_rcu - - - 0.00% [k] psi_flags_change - - - 0.00% [k] __note_gp_changes - - - 0.00% [k] tick_nohz_handler - - - 0.00% [.] ::event_loop_thread - - - 0.00% [k] hrtimer_try_to_cancel - - - 0.00% [k] sched_clock_stable - - - 0.00% [k] asm_sysvec_apic_timer_interrupt - - - 0.00% [k] rwb_arm_timer - - - 0.00% [k] rb_next - - - 0.00% [k] irqentry_enter - - - 0.00% [k] __remove_hrtimer - - - 0.00% [k] super_written - - - 0.00% [k] mod_timer - - - 0.00% [k] update_sd_lb_stats.constprop.0 - - - 0.00% [.] , h2::proto::streams::prioritize::Prioritized>> as futures_core::stream::Stream>::poll_next - - - 0.00% [.] , h2::client::Peer>>::clear_expired_reset_streams - - - 0.00% [.] ::maintenance - - - 0.00% [k] __alloc_skb - - - 0.00% [k] __pmu_ctx_sched_out - - - 0.00% [k] igb_poll - - - 0.00% [k] tcp_stream_memory_free - - - 0.00% [.] ::record_data - - - 0.00% [k] netif_skb_features - - - 0.00% [.] ::serialize::<&mut serde_json::ser::Serializer<&mut alloc::vec::Vec>> - - - 0.00% [k] sd_uninit_command - - - 0.00% [.] >::send - - - 0.00% [k] inotify_read - - - 0.00% [k] try_to_wake_up - - - 0.00% [k] lookup_fast - - - 0.00% [.] 
::new - - - 0.00% [k] __cgroup_account_cputime - - - 0.00% [k] blk_flush_complete_seq - - - 0.00% [.] ::get_segment - - - 0.00% [.] ::from_capnp - - - 0.00% [k] _raw_spin_rq_lock_irqsave - - - 0.00% [k] native_apic_mem_eoi - - - 0.00% [k] terminate_walk - - - 0.00% [.] >::child_for_key::<&[u8]> - - - 0.00% [.] core::ptr::drop_in_place:: - - - 0.00% [k] scsi_finish_command - - - 0.00% [k] __memcg_slab_free_hook - - - 0.00% [k] ext4_release_file - - - 0.00% [k] scsi_decide_disposition - - - 0.00% [k] update_cfs_group - - - 0.00% [.] consciousness::hippocampus::store::capnp::read_text - - - 0.00% [k] mempool_free_slab - - - 0.00% [.] >::range::<&[u8], core::ops::range::RangeInclusive<&[u8]>> - - - 0.00% [.] std::sys::env::unix::getenv::{closure#0} - - - 0.00% [.] core::slice::sort::shared::pivot::median3_rec::<(alloc::string::String, consciousness::locks::LockStats), <[(alloc::string::String, consciousness::locks::LockStats)]>::sort_by::{closure#0}> - - - 0.00% [k] ext4_wait_block_bitmap - - - 0.00% [.] ::deserialize_any< as serde_core::de::Deserialize>::deserialize::VecVisitor>::{closure#0}> as serde_core::de::SeqAccess>::next_element_seed::> - - - 0.00% [.] ::new - - - 0.00% [.] core::fmt::float::float_to_decimal_common_shortest:: - - - 0.00% [.] getenv - - - 0.00% [.] ::_join - - - 0.00% [k] irqentry_exit_to_user_mode - - - 0.00% [.] as capnp::private::arena::ReaderArena>::check_offset - - - 0.00% [k] ext4_get_inode_loc - - - 0.00% [.] >::insert - - - 0.00% [k] fsnotify_open_perm_and_set_mode - - - 0.00% [.] ::avg_clustering_coefficient - - - 0.00% [k] tomoyo_init_request_info - - - 0.00% [.] ::fmt - - - 0.00% [.] >::pop - - - 0.00% [k] __list_add_valid_or_report - - - 0.00% [.] ::load - - - 0.00% [k] netdev_pick_tx - - - 0.00% [k] ata_qc_complete_multiple - - - 0.00% [k] do_filp_open - - - 0.00% [.] as serde_core::de::Deserializer>::deserialize_any::<::deserialize::__Visitor> - - - 0.00% [.] ::register_by_ref - - - 0.00% [.] 
::poll_flush - - - 0.00% [k] dev_gro_receive - - - 0.00% [.] as alloc::vec::spec_from_iter::SpecFromIter<(alloc::string::String, figment::value::value::Value), core::iter::adapters::map::Map, >>::from::{closure#0}>>>::from_iter - - - 0.00% [.] as core::iter::traits::iterator::Iterator>::fold::<(), core::iter::adapters::map::map_fold<(char, isize), char, (), ::transform_range>::{closure#1}, core::iter::traits::iterator::Iterator::for_each::call>::extend, ::transform_range>::{closure#1}>>::{closure#0}>::{closure#0}>::{closure#0}> - - - 0.00% [k] step_into - - - 0.00% [k] refill_stock - - - 0.00% [.] core::ptr::drop_in_place:: - - - 0.00% [k] strncpy_from_user - - - 0.00% [.] ::poll_data - - - 0.00% [.] ::discard - - - 0.00% [.] <&dyn core::fmt::Debug as core::fmt::Debug>::fmt - - - 0.00% [.] <&mut ::merge_all::{closure#0} as core::ops::function::FnMut<((usize, &[tokenizers::models::bpe::word::Symbol]),)>>::call_mut - - - 0.00% [.] >::try_with::, tokio::task::coop::poll_proceed::{closure#0}>::{closure#0}, core::task::poll::Poll> - - - 0.00% [k] pick_task_fair - - - 0.00% [.] std::sys::fs::unix::try_statx - - - 0.00% [k] futex_hash_put - - - 0.00% [k] vfs_read - - - 0.00% [.] ::write_str - - - 0.00% [k] avc_has_perm_noaudit - - - 0.00% [.] as core::fmt::Debug>::fmt - - - 0.00% [k] kmem_cache_alloc_noprof - - - 0.00% [.] ::notify_parked_local - - - 0.00% [.] core::slice::sort::shared::smallsort::insertion_sort_shift_left::<(alloc::string::String, figment::value::value::Value), <[(alloc::string::String, figment::value::value::Value)]>::sort_by< as core::iter::traits::collect::FromIterator<(alloc::string::String, figment::value::value::Value)>>::from_iter, >>::from::{closure#0}>>::{closure#0}>::{closure#0}> - - - 0.00% [.] core::num::imp::flt2dec::strategy::grisu::format_shortest_opt - - - 0.00% [.] 
::tokenize - - - 0.00% [k] __smp_call_single_queue - - - 0.00% [k] __fput - - - 0.00% [k] __napi_build_skb - - - 0.00% [k] set_next_buddy - - - 0.00% [k] selinux_file_open - - - 0.00% [k] igb_msix_ring - - - 0.00% [k] copy_from_kernel_nofault - - - 0.00% [.] core::ptr::drop_in_place:: - - - 0.00% [.] >::get:: - - - 0.00% [.] ::end - - - 0.00% [.] ::value_start - - - 0.00% [.] >::reserve::do_reserve_and_handle:: - - - 0.00% [k] bpf_lsm_socket_recvmsg - - - 0.00% [.] >::new - - - 0.00% [.] < as serde_core::de::Deserialize>::deserialize::VecVisitor as serde_core::de::Visitor>::visit_seq::<&mut serde_json::value::de::SeqDeserializer> - - - 0.00% [k] mutex_lock - - - 0.00% [k] __ip_finish_output - - - 0.00% [.] ::filled_mut - - - 0.00% [.] ::is_special_token - - - 0.00% [k] ip_queue_xmit - - - 0.00% [k] simple_copy_to_iter - - - 0.00% [k] tcp_poll - - - 0.00% [.] ::from_utf8_lossy - - - 0.00% [.] unicode_normalization_alignments::lookups::composition_table - - - 0.00% [.] as core::ops::drop::Drop>::drop - - - 0.00% [.] >::dying_next - - - 0.00% [.] ::deserialize:: - - - 0.00% [k] ksys_read - - - 0.00% [k] ip_skb_dst_mtu - - - 0.00% [.] as alloc::vec::spec_from_iter::SpecFromIter<(alloc::string::String, figment::value::value::Value), core::iter::adapters::map::Map, >>::from::{closure#0}>>>::from_iter - - - 0.00% [k] do_mkdirat - - - 0.00% [k] __tcp_transmit_skb - - - 0.00% [k] selinux_ip_output - - - 0.00% [.] ::write - - - 0.00% [.] as alloc::vec::spec_from_iter::SpecFromIter, >::decode::{closure#0}>>>::from_iter - - - 0.00% [.] ::hash_one::<&&core::panic::location::Location> - - - 0.00% [.] __syscall_cancel - - - 0.00% [k] __x64_sys_futex - - - 0.00% [.] consciousness::agent::context::scan_close_tag - - - 0.00% [k] path_get - - - 0.00% [k] dev_hard_start_xmit - - - 0.00% [k] hrtimer_try_to_cancel.part.0 - - - 0.00% [.] , notify::error::Error>>>::recv::{closure#1} - - - 0.00% [k] bpf_lsm_inode_permission - - - 0.00% [.] 
alloc::fmt::format::format_inner - - - 0.00% [k] add_transaction_credits - - - 0.00% [k] sched_clock_noinstr - - - 0.00% [.] ::debug_struct_field3_finish - - - 0.00% [.] ::deserialize_any::<::deserialize::__Visitor> - - - 0.00% [k] selinux_file_permission - - - 0.00% [.] ::slice::> - - - 0.00% [.] , notify::error::Error>>>::send - - - 0.00% [.] <&std::fs::File as std::io::Write>::write - - - 0.00% [.] >::pop - - - 0.00% [k] __list_del_entry_valid_or_report - - - 0.00% [.] as core::ops::drop::Drop>::drop - - - 0.00% [.] ::consume - - - 0.00% [.] ::nfc - - - 0.00% [k] __put_user_8 - - - 0.00% [.] ::is_contained_in - - - 0.00% [.] ::now - - - 0.00% [.] ::release_capacity - - - 0.00% [k] avg_vruntime - - - 0.00% [.] ::next - - - 0.00% [.] ::end_processing_scheduled_tasks - - - 0.00% [.] ::run::{closure#0}, alloc::sync::Arc>>::poll - - - 0.00% [k] __skb_datagram_iter - - - 0.00% [k] ip_sublist_rcv - - - 0.00% [.] ::write - - - 0.00% [k] file_has_perm - - - 0.00% [.] clock_gettime@@GLIBC_2.17 - - - 0.00% [.] ::pad - - - 0.00% [.] as anyhow::Context<(), capnp::Error>>::with_context::::append_relations::{closure#0}> - - - 0.00% [.] ::key_unchecked - - - 0.00% [.] ::release_connection_capacity - - - 0.00% [k] switch_fpu_return - - - 0.00% [.] ::saturating_duration_since - - - 0.00% [.] >, rustls::client::client_conn::connection::ClientConnection>>::read_io - - - 0.00% [.] ::has_message_ready - - - 0.00% [.] core::ptr::drop_in_place:: - - - 0.00% [.] ::flush - - - 0.00% [k] perf_event_groups_next - - - 0.00% [k] __futex_hash - - - 0.00% [k] sock_recvmsg - - - 0.00% [.] json_five::de::from_str:: - - - 0.00% [.] ::deserialize_any:: - - - 0.00% [k] xas_start - - - 0.00% [.] ::wake - - - 0.00% [.] ::next_match - - - 0.00% [.] match_at - - - 0.00% [k] __tcp_select_window - - - 0.00% [k] tcp_recvmsg - - - 0.00% [k] ext4_ext_insert_extent - - - 0.00% [.] ::wake - - - 0.00% [k] tcp_established_options - - - 0.00% [.] 
serde_json::value::de::visit_array::< as serde_core::de::Deserialize>::deserialize::VecVisitor> - - - 0.00% [k] __hrtimer_setup - - - 0.00% [k] common_interrupt - - - 0.00% [.] >::add - - - 0.00% [.] ::park_condvar - - - 0.00% [k] ip_send_check - - - 0.00% [k] igb_xmit_frame - - - 0.00% [.] ::neighbors - - - 0.00% [.] _int_realloc - - - 0.00% [k] nf_hook_slow - - - 0.00% [.] <&mio::net::tcp::stream::TcpStream as std::io::Read>::read - - - 0.00% [k] skb_try_coalesce - - - 0.00% [.] ::fmt - - - 0.00% [.] >>::recv_data - - - 0.00% [.] ::finish_grow - - - 0.00% [.] ::parse_next_component_back - - - 0.00% [k] tcp_update_recv_tstamps - - - 0.00% [k] _copy_from_user - - - 0.00% [k] rcu_note_context_switch - - - 0.00% [.] ::try_reserve_exact - - - 0.00% [.] __GI___libc_write - - - 0.00% [k] fdget_pos - - - 0.00% [.] ::send_pending_go_away::, h2::proto::streams::prioritize::Prioritized>> - - - 0.00% [k] folio_mark_accessed - - - 0.00% [.] core::ptr::drop_in_place::> - - - 0.00% [.] ::poll_data - - - 0.00% [.] >::into_owned - - - 0.00% [.] >>> as hyper::rt::io::Write>::poll_flush - - - 0.00% [k] cyc2ns_read_begin - - - 0.00% [k] skb_defer_free_flush - - - 0.00% [.] ::send_pending_pong::, h2::proto::streams::prioritize::Prioritized>> - - - 0.00% [k] kmem_cache_alloc_bulk_noprof - - - 0.00% [k] __ext4_journal_get_write_access - - - 0.00% [.] >>::insert - - - 0.00% [k] net_rx_action - - - 0.00% [.] >, ::pre_tokenize::{closure#1}::{closure#0}> as core::iter::traits::iterator::Iterator>::fold::<(), core::iter::traits::iterator::Iterator::for_each::call<(char, isize), >::extend_trusted>, ::pre_tokenize::{closure#1}::{closure#0}>>::{closure#0}>::{closure#0}> - - - 0.00% [.] > as core::future::future::Future>::poll - - - 0.00% [.] ::drop - - - 0.00% [k] get_nohz_timer_target - - - 0.00% [.] core::ptr::drop_in_place::> - - - 0.00% [.] as rustls::conn::connection::PlaintextSink>::flush - - - 0.00% [.] ::process - - - 0.00% [.] 
::register - - - 0.00% [k] blk_attempt_bio_merge.part.0 - - - 0.00% [.] ::open_within - - - 0.00% [k] cyc2ns_read_end - - - 0.00% [.] __libc_recv - - - 0.00% [.] ::park_driver - - - 0.00% [k] ksys_write - - - 0.00% [k] wbc_detach_inode - - - 0.00% [.] bytes::bytes::static_drop - - - 0.00% [k] arch_perf_update_userpage - - - 0.00% [.] ::push_back:: - - - 0.00% [k] lock_sock_nested - - - 0.00% [k] __bitmap_and - - - 0.00% [.] core::ptr::drop_in_place::, alloc::vec::Vec, ::decode_chain::{closure#0}>> - - - 0.00% [.] ::ensure_recv_open - - - 0.00% [k] ip_finish_output2 - - - 0.00% [.] as bytes::buf::buf_impl::Buf>::get_uint - - - 0.00% [.] ::eq - - - 0.00% [k] tcp_recv_timestamp - - - 0.00% [k] futex_wait - - - 0.00% [.] >::try_with::, tokio::task::coop::poll_proceed::{closure#0}>::{closure#0}, core::task::poll::Poll> - - - 0.00% [.] as core::ops::drop::Drop>::drop - - - 0.00% [k] __x64_sys_read - - - 0.00% [.] , alloc::collections::btree::node::marker::KV>>::remove_leaf_kv::<>::remove_kv::{closure#0}, alloc::alloc::Global> - - - 0.00% [k] path_openat - - - 0.00% [k] queue_work_on - - - 0.00% [k] alloc_file_pseudo - - - 0.00% [.] ::add_path - - - 0.00% [.] ::park_timeout - - - 0.00% [k] plist_del - - - 0.00% [k] __slab_free - - - 0.00% [k] wbt_data_dir - - - 0.00% [.] , notify::error::Error>>>::recv - - - 0.00% [k] _find_next_or_bit - - - 0.00% [k] unix_stream_recvmsg - - - 0.00% [k] native_write_msr - - - 0.00% [k] __percpu_counter_sum - - - 0.00% [k] nmi_restore - - - 0.00% [k] calc_timer_values - - - 0.00% [k] pv_native_write_cr2 - - - 0.00% [k] blk_mq_complete_request_remote - - - 0.00% [.] >::recv - - - 0.00% [k] perf_event_update_userpage - - - 0.00% [k] entry_SYSCALL_64_after_hwframe - - - 0.00% [k] exc_nmi - - - 0.00% [k] end_repeat_nmi - - - 0.00% [k] entry_SYSRETQ_unsafe_stack - - - 0.00% [.] 
__syscall_cancel_arch - - - 0.00% [k] amd_cc_platform_has - - - 0.00% [k] local_clock_noinstr - - - 0.00% [k] perf_event_idx_default - - - 0.00% [k] local_clock - - - 0.00% [k] nmi_handle.part.0 - - - - -# -# (Tip: To see how parallelism changes over time, try: perf report -F time,latency,parallelism --time-quantum=1s) -# diff --git a/sa-schedule-aligned-variation.py b/sa-schedule-aligned-variation.py deleted file mode 100644 index 405ee2b..0000000 --- a/sa-schedule-aligned-variation.py +++ /dev/null @@ -1,200 +0,0 @@ -"""After applying Procrustes alignment to remove known gauge freedoms -(per-head d_h rotation tying Q/K/V/O, per-layer d_ff rotation tying -gate/up/down), measure per-family cos-sim between adjacent layers across -the whole network. - -Runs Procrustes SVDs on GPU for speed. -""" -import argparse -import json -import numpy as np -import torch -from transformers import AutoModelForCausalLM - - -def procrustes_gpu(M): - """Orthogonal R maximizing tr(R M). R = U V^T where M = U Σ V^T. - M on GPU; returns R on GPU.""" - U, _, Vh = torch.linalg.svd(M, full_matrices=False) - return U @ Vh - - -def frob_gpu(x): - return float(torch.linalg.norm(x).item()) - - -def normalize_fro_gpu(x, eps=1e-12): - n = torch.linalg.norm(x) - return x / n.clamp_min(eps) - - -@torch.no_grad() -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("--model", default="Qwen/Qwen3-4B") - ap.add_argument("--out", default="/tmp/sa-aligned-variation.json") - ap.add_argument("--device", default="cuda") - ap.add_argument("--pairs", default="", - help="Comma-separated list of L indices to run pair (L, L+1) for. " - "Empty = all pairs. E.g. 
'0,20,30,38,46,52,57' samples phases.") - args = ap.parse_args() - - dev = torch.device(args.device) - print(f"Loading {args.model} ...", flush=True) - model = AutoModelForCausalLM.from_pretrained( - args.model, - torch_dtype=torch.float32, - device_map="cpu", - trust_remote_code=True, - attn_implementation="eager", - ) - cfg = model.config - num_layers = cfg.num_hidden_layers - num_heads = cfg.num_attention_heads - num_kv_heads = getattr(cfg, "num_key_value_heads", num_heads) - hidden = cfg.hidden_size - head_dim = getattr(cfg, "head_dim", hidden // num_heads) - intermediate = cfg.intermediate_size - print(f" L={num_layers} H={num_heads} kv={num_kv_heads} hd={head_dim} " - f"hidden={hidden} ff={intermediate}", flush=True) - - # Collect per-layer weights - layers = [] - for L in range(num_layers): - layer = model.model.layers[L] - attn = layer.self_attn - mlp = layer.mlp - layers.append({ - "q_proj": attn.q_proj.weight.detach().float(), - "k_proj": attn.k_proj.weight.detach().float(), - "v_proj": attn.v_proj.weight.detach().float(), - "o_proj": attn.o_proj.weight.detach().float(), - "gate_proj": mlp.gate_proj.weight.detach().float(), - "up_proj": mlp.up_proj.weight.detach().float(), - "down_proj": mlp.down_proj.weight.detach().float(), - }) - del model - - # Per-adjacent-pair analysis - aligned_cos = {fam: {} for fam in - ["q_proj", "k_proj", "v_proj", "o_proj", - "gate_proj", "up_proj", "down_proj"]} - - if args.pairs: - pair_L_list = [int(x) for x in args.pairs.split(",")] - else: - pair_L_list = list(range(num_layers - 1)) - - for L in pair_L_list: - A = layers[L] - B = layers[L + 1] - - # -------- Per-head attention alignment (d_h × d_h) -------- - Qa = A["q_proj"].to(dev).reshape(num_heads, head_dim, hidden) - Qb = B["q_proj"].to(dev).reshape(num_heads, head_dim, hidden) - Ka = A["k_proj"].to(dev).reshape(num_kv_heads, head_dim, hidden) - Kb = B["k_proj"].to(dev).reshape(num_kv_heads, head_dim, hidden) - Va = A["v_proj"].to(dev).reshape(num_kv_heads, head_dim, 
hidden) - Vb = B["v_proj"].to(dev).reshape(num_kv_heads, head_dim, hidden) - # o_proj is (hidden, num_heads*head_dim); split per head - Oa = A["o_proj"].to(dev).reshape(hidden, num_heads, head_dim).permute(1, 0, 2).contiguous() - Ob = B["o_proj"].to(dev).reshape(hidden, num_heads, head_dim).permute(1, 0, 2).contiguous() - # (num_heads, hidden, head_dim) - - q_cos = [] - k_cos = [] - v_cos = [] - o_cos = [] - for h in range(num_heads): - kv_h = (h * num_kv_heads) // num_heads - qa = normalize_fro_gpu(Qa[h]) - qb = normalize_fro_gpu(Qb[h]) - ka = normalize_fro_gpu(Ka[kv_h]) - kb = normalize_fro_gpu(Kb[kv_h]) - va = normalize_fro_gpu(Va[kv_h]) - vb = normalize_fro_gpu(Vb[kv_h]) - oa = normalize_fro_gpu(Oa[h]) - ob = normalize_fro_gpu(Ob[h]) - - # Cross-correlation for joint alignment: we want R s.t. - # R qa ≈ qb (etc), minimize sum of ||R X_a - X_b||² → - # max tr(R M) with M = qa qb^T + ka kb^T + va vb^T + oa^T ob - M = qa @ qb.T + ka @ kb.T + va @ vb.T + oa.T @ ob - R = procrustes_gpu(M) - - # Post-alignment cos-sim (since matrices unit-normalized, cos - # = = tr(qb^T R qa) = tr(R qa qb^T)) - q_cos.append(float(torch.sum(R @ qa * qb).item())) - k_cos.append(float(torch.sum(R @ ka * kb).item())) - v_cos.append(float(torch.sum(R @ va * vb).item())) - # For O: O after rotation = oa R^T; cos = - o_cos.append(float(torch.sum(oa @ R.T * ob).item())) - - aligned_cos["q_proj"][L] = float(np.mean(q_cos)) - aligned_cos["k_proj"][L] = float(np.mean(k_cos)) - aligned_cos["v_proj"][L] = float(np.mean(v_cos)) - aligned_cos["o_proj"][L] = float(np.mean(o_cos)) - - # -------- d_ff × d_ff alignment for gate/up/down -------- - ga = normalize_fro_gpu(A["gate_proj"].to(dev)) - gb = normalize_fro_gpu(B["gate_proj"].to(dev)) - ua = normalize_fro_gpu(A["up_proj"].to(dev)) - ub = normalize_fro_gpu(B["up_proj"].to(dev)) - da = normalize_fro_gpu(A["down_proj"].to(dev)) # (hidden, d_ff) - db = normalize_fro_gpu(B["down_proj"].to(dev)) - - # All of ga, gb, ua, ub are (d_ff, hidden); da, db 
are (hidden, d_ff) - # Cross-correlation: M = ga gb^T + ua ub^T + da^T db (d_ff × d_ff) - M_ff = ga @ gb.T + ua @ ub.T + da.T @ db - S = procrustes_gpu(M_ff) - - aligned_cos["gate_proj"][L] = float(torch.sum(S @ ga * gb).item()) - aligned_cos["up_proj"][L] = float(torch.sum(S @ ua * ub).item()) - aligned_cos["down_proj"][L] = float(torch.sum(da @ S.T * db).item()) - - # Free GPU memory - del Qa, Qb, Ka, Kb, Va, Vb, Oa, Ob - del ga, gb, ua, ub, da, db, M_ff, S - torch.cuda.empty_cache() - - print(f" done pair L={L}->L+1 " - f"(q={aligned_cos['q_proj'][L]:+.4f} gate={aligned_cos['gate_proj'][L]:+.4f})", - flush=True) - - # Report - print("\n=== Adjacent-layer cos-sim AFTER Procrustes alignment ===\n") - print(" cos=1 means identical after gauge rotation; cos=0 means orthogonal\n") - header = " L " - for fam in aligned_cos: - header += f" {fam:>12}" - print(header) - for L in sorted(pair_L_list): - if L not in aligned_cos["q_proj"]: - continue - row = f" {L:>2}" - for fam in aligned_cos: - row += f" {aligned_cos[fam][L]:+12.4f}" - print(row) - - print("\n=== Per-family summary (aligned) ===") - print(f" {'family':>14} {'mean_cos':>10} {'median_cos':>11} " - f"{'aligned_resid':>14}") - for fam, vals_dict in aligned_cos.items(): - vs = np.array(list(vals_dict.values())) - if len(vs) == 0: - continue - resid = float(np.sqrt(np.maximum(1.0 - vs**2, 0.0)).mean()) - print(f" {fam:>14} {vs.mean():>+10.4f} {np.median(vs):>+11.4f} " - f"{resid:>14.4f}") - - with open(args.out, "w") as f: - json.dump({ - "model": args.model, - "num_layers": num_layers, - "aligned_cos": aligned_cos, - }, f, indent=2) - print(f"\nSaved: {args.out}") - - -if __name__ == "__main__": - main() diff --git a/sa-schedule-analyze-aligned.py b/sa-schedule-analyze-aligned.py deleted file mode 100644 index 919de10..0000000 --- a/sa-schedule-analyze-aligned.py +++ /dev/null @@ -1,157 +0,0 @@ -"""Analyze aligned_variation output to answer the training-artifact vs -specialization question. 
- -Inputs: qwen3-*-null.json (raw cos-sim) + qwen3-*-aligned.json (aligned cos-sim) - -For each layer pair where aligned data exists, compare: - raw_cos(L) — before Procrustes alignment - aligned_cos(L) — after Procrustes alignment - delta = aligned_cos - raw_cos - -If delta is substantial (aligned much larger than raw), rotation gauge -was hiding shared structure → training-artifact hypothesis supported. -If delta ≈ 0, specialization is real (rotation can't find shared -structure because there isn't any). - -Stratify by phase to test prediction that LATE layers have LARGER delta -(more rotation-gauge noise, less real specialization). -""" -import argparse -import json -import numpy as np - - -def phase_of(L, num_layers): - """Rough phase assignment based on measured 32B entropy boundaries. - For other models we'd refit — but shape should be similar.""" - if num_layers == 64: # Qwen3-32B - if L <= 6: - return "A" - elif L <= 9: - return "B" - elif L <= 31: - return "C" - elif L <= 46: - return "D" - elif L <= 58: - return "E" - else: - return "tail" - elif num_layers == 36: # Qwen3-4B - if L <= 6: - return "A" - elif L <= 9: - return "B" - elif L <= 23: - return "C" - elif L <= 33: - return "D" - else: - return "tail" - else: - frac = L / num_layers - if frac < 0.11: - return "A" - elif frac < 0.15: - return "B" - elif frac < 0.5: - return "C" - elif frac < 0.75: - return "D" - elif frac < 0.92: - return "E" - else: - return "tail" - - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("null_json", help="output of null_residual.py") - ap.add_argument("aligned_json", help="output of aligned_variation.py") - args = ap.parse_args() - - null = json.load(open(args.null_json)) - aligned = json.load(open(args.aligned_json)) - - num_layers = aligned["num_layers"] - aligned_cos = aligned["aligned_cos"] # dict: family -> {L: cos} - pair_results = null["pair_results"] # list of {L, L_next, families: {family: {cos, ...}}} - - # Build raw_cos dict from null output 
- raw_cos = {fam: {} for fam in ["q_proj", "k_proj", "v_proj", "o_proj", - "gate_proj", "up_proj", "down_proj"]} - for pr in pair_results: - L = pr["L"] - for fam in raw_cos: - if fam in pr["families"]: - raw_cos[fam][L] = pr["families"][fam]["cos"] - - print(f"=== Aligned vs Raw cos-sim comparison ({args.aligned_json}) ===") - print(f" {num_layers} layers total; aligned data for " - f"{len(aligned_cos['q_proj'])} pairs\n") - - # Per-pair table: L, phase, family cos-sims raw and aligned - families = ["q_proj", "k_proj", "v_proj", "o_proj", - "gate_proj", "up_proj", "down_proj"] - - print(f" {'L':>3} {'phase':>5}", end="") - for fam in families: - print(f" {fam+'_raw':>10} {fam+'_ali':>10}", end="") - print() - - L_keys = sorted([int(L) for L in aligned_cos["q_proj"].keys()]) - for L in L_keys: - Lstr = str(L) - phase = phase_of(L, num_layers) - row = f" {L:>3} {phase:>5}" - for fam in families: - r = raw_cos[fam].get(L, None) - a = aligned_cos[fam].get(Lstr, None) - rstr = f"{r:+10.4f}" if r is not None else " N/A" - astr = f"{a:+10.4f}" if a is not None else " N/A" - row += f" {rstr} {astr}" - print(row) - - # Aggregate by phase: mean (aligned - raw) per family per phase - print("\n=== Per-phase mean delta (aligned_cos - raw_cos) by family ===") - print(f" Large positive delta = rotation alignment revealed shared") - print(f" structure. 
Small delta = specialization is gauge-independent.\n") - - phase_deltas = {} - for L in L_keys: - Lstr = str(L) - ph = phase_of(L, num_layers) - for fam in families: - r = raw_cos[fam].get(L, None) - a = aligned_cos[fam].get(Lstr, None) - if r is not None and a is not None: - phase_deltas.setdefault(ph, {}).setdefault(fam, []).append(a - r) - - print(f" {'phase':>6}", end="") - for fam in families: - print(f" {fam:>10}", end="") - print() - for ph in sorted(phase_deltas.keys()): - print(f" {ph:>6}", end="") - for fam in families: - vals = phase_deltas[ph].get(fam, []) - if vals: - print(f" {np.mean(vals):+10.4f}", end="") - else: - print(f" {'—':>10}", end="") - print() - - # Interpretation - print("\n=== Interpretation ===") - print(" Prediction under training-artifact hypothesis:") - print(" delta(Phase E) > delta(Phase C) for projection families") - print(" → late layers have more rotation-gauge-hidden structure") - print(" → specialization is partly training noise, not structural") - print("") - print(" Prediction under real-specialization hypothesis:") - print(" delta ~ 0 everywhere") - print(" → layers genuinely point in different directions, gauge irrelevant") - - -if __name__ == "__main__": - main() diff --git a/sa-schedule-analyze-grams.py b/sa-schedule-analyze-grams.py deleted file mode 100644 index b4cdc4e..0000000 --- a/sa-schedule-analyze-grams.py +++ /dev/null @@ -1,168 +0,0 @@ -"""Analyze operator-level inter-layer alignment from the grams + eigdirs files. - -Input: - qwen3-4b-grams.json (gram[L,L',h], fro_sq[L,h]) - qwen3-4b-grams-eigdirs.pt (eig_dirs[L,h,topk,hidden], sym_eigs[L,h,2*head_dim]) - -Questions: - (a) Operator cos-sim between layers. cos(g_L^h, g_L'^h) = gram / √(fro_sq fro_sq'). - If ~1 → same operator up to scalar. If low → distinct operators. - (b) Scalar-rescale residual using full operator (not spectrum): - optimal T = gram / fro_sq', residual_frac = √(1 - cos²). - (c) Curvature-sign alignment. 
For each (L, anchor) pair, what fraction of - top-k signed eigenvalues share sign with the anchor's? - (d) Top-k eigensubspace alignment. Principal angles between span{eig_dirs_L} - and span{eig_dirs_anchor}. - - Compare: operator cos-sim vs spectral cos-sim (from prior analysis). The - sheaf-rs finding was that spectral shape converges across layers while - eigenvectors don't. We want to confirm/refute that within QK in Qwen3-4B. -""" -import argparse -import json -import numpy as np -import torch - - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("gram_json") - ap.add_argument("--anchor", type=int, default=-1, - help="anchor layer index; -1 = last") - args = ap.parse_args() - - with open(args.gram_json) as f: - d = json.load(f) - num_layers = d["num_layers"] - num_heads = d["num_heads"] - head_dim = d["head_dim"] - hidden = d["hidden_size"] - gram = np.array(d["gram"]) # (L, L, H) - # NOTE: fro_sq from the json is ||W_K W_Q^T||_F^2 (the measure.py - # shortcut), NOT ||g||_F^2 = ||W_K^T W_Q||_F^2 which is what the gram - # diagonal gives. Different objects. Use gram diagonal for normalization. 
- diag_sq = np.array([[gram[L, L, h] for h in range(num_heads)] - for L in range(num_layers)]) # (L, H) - diag = np.sqrt(np.maximum(diag_sq, 1e-20)) # ||g_L^h||_F - - pt = torch.load(d.get("eigdirs_path", args.gram_json.replace(".json", "-eigdirs.pt")), - weights_only=True) - eig_dirs = pt["eig_dirs"].double().numpy() # (L, H, topk, hidden) - sym_eigs = pt["sym_eigs"].double().numpy() # (L, H, 2*head_dim) - topk = eig_dirs.shape[2] - anchor = args.anchor if args.anchor >= 0 else num_layers - 1 - - # ========================================================== - # (a) Operator cos-sim matrix, averaged over heads - # ========================================================== - cos_mat = np.zeros((num_layers, num_layers)) - for L in range(num_layers): - for Lp in range(num_layers): - denom = diag[L] * diag[Lp] - per_h = gram[L, Lp] / np.maximum(denom, 1e-20) - cos_mat[L, Lp] = per_h.mean() - - print(f"=== (a) Operator cos-sim between layers, averaged over {num_heads} heads ===") - print(f" diagonal (should be 1.0): mean {np.diag(cos_mat).mean():.4f}") - # Adjacent-layer cos-sim - adj = np.array([cos_mat[L, L+1] for L in range(num_layers-1)]) - print(f" adjacent layers cos-sim: mean {adj.mean():.4f} min {adj.min():.4f} max {adj.max():.4f}") - # Layer-to-anchor cos-sim - to_anchor = cos_mat[:, anchor] - print(f" layer -> anchor L={anchor} cos-sim:") - print(f" {'L':>3} {'cos':>7} {'T_opt':>7} {'resid_frac':>10}") - for L in range(num_layers): - c = to_anchor[L] - T = float(np.mean(gram[L, anchor] / np.maximum(diag_sq[anchor], 1e-20))) - r = float(np.sqrt(max(1.0 - c**2, 0.0))) - print(f" {L:>3} {c:+.4f} {T:+7.3f} {r:>10.4f}") - - # Long-range cos-sim (L=0 to L=35 vs L=17 to L=35 etc.) 
- print(f"\n long-range: cos(L=0, last) = {cos_mat[0, -1]:+.3f} " - f"cos(L=midish, last) = {cos_mat[num_layers//2, -1]:+.3f}") - - # ========================================================== - # (b) Full scalar-rescale residual using the gram - # ========================================================== - print(f"\n=== (b) Operator-level scalar rescale to anchor L={anchor} ===") - # residual_frac² = 1 - cos²(g_L, g_anchor) (per head) - print(f" {'L':>3} {'mean_cos':>9} {'mean_resid':>10}") - for L in range(num_layers): - per_h_cos = gram[L, anchor] / np.maximum(diag[L] * diag[anchor], 1e-20) - per_h_resid = np.sqrt(np.clip(1.0 - per_h_cos**2, 0.0, 1.0)) - print(f" {L:>3} {per_h_cos.mean():>+9.4f} {per_h_resid.mean():>10.4f}") - - # ========================================================== - # (c) Curvature-sign alignment - # ========================================================== - print(f"\n=== (c) Curvature-sign alignment vs anchor L={anchor} ===") - # Look at top-k eigenvalues by magnitude (already sorted that way in measure). - # Fraction of top-k (L, h) whose sign matches the anchor's i-th eigenvalue. 
- for k_use in [2, 4, 8, 16, 32, 64, 128, 256]: - if k_use > sym_eigs.shape[-1]: - continue - # sign of top-k_use eigenvalues at layer L vs at anchor, per (L, h) - sign_L = np.sign(sym_eigs[:, :, :k_use]) # (L, H, k_use) - sign_a = np.sign(sym_eigs[anchor, :, :k_use]) # (H, k_use) - agree = (sign_L == sign_a[None, :, :]).mean(axis=-1) # (L, H) - print(f" top-{k_use:>3} signs: mean agree = {agree.mean():.3f} " - f"by layer range: early {agree[:12].mean():.3f} " - f"mid {agree[12:24].mean():.3f} late {agree[24:].mean():.3f}") - - # Also: distribution of sign-balance per layer (fraction positive eigenvalues) - frac_pos = (sym_eigs[:, :, :2 * head_dim] > 0).mean(axis=(1, 2)) - print(f"\n fraction positive eigenvalues per layer:") - for L in range(num_layers): - print(f" L={L:2} frac+ = {frac_pos[L]:.3f}") - - # ========================================================== - # (d) Eigenspace principal angles - # ========================================================== - print(f"\n=== (d) Top-{topk} eigensubspace principal angles vs anchor L={anchor} ===") - # Per-head: cos of principal angles between row-spans of eig_dirs[L, h] - # and eig_dirs[anchor, h]. Report mean cos angle per layer. 
- print(f" {'L':>3} {'meanCosPA':>10} {'minCosPA':>10} {'max_top1':>10}") - for L in range(num_layers): - mean_cos_pa_per_h = [] - min_cos_pa_per_h = [] - top1_overlap = [] - for h in range(num_heads): - A = eig_dirs[L, h] # (topk, hidden) rows are unit vectors - B = eig_dirs[anchor, h] # (topk, hidden) - # Orthonormalize rows (they're close-to-orthonormal but not exactly) - Qa, _ = np.linalg.qr(A.T) # hidden × topk - Qb, _ = np.linalg.qr(B.T) - M = Qa.T @ Qb # topk × topk - s = np.linalg.svd(M, compute_uv=False) - mean_cos_pa_per_h.append(s.mean()) - min_cos_pa_per_h.append(s.min()) - # ||² — top-1 eigenvector overlap - top1_overlap.append(float((A[0] @ B[0]) ** 2)) - print(f" {L:>3} {np.mean(mean_cos_pa_per_h):>10.4f} " - f"{np.mean(min_cos_pa_per_h):>10.4f} " - f"{np.mean(top1_overlap):>10.4f}") - - # ========================================================== - # Verdict - # ========================================================== - to_anchor_per_head = np.array([ - (gram[L, anchor] / np.maximum(diag[L] * diag[anchor], 1e-20)).mean() - for L in range(num_layers) - ]) - mean_cos_to_anchor = to_anchor_per_head.mean() - print(f"\n=== Verdict ===") - print(f" mean operator cos-sim to anchor: {mean_cos_to_anchor:+.4f}") - adj_mean = adj.mean() - print(f" mean operator cos-sim adjacent layers: {adj_mean:+.4f}") - if mean_cos_to_anchor > 0.9: - print(" STRONG: same operator up to scalar across all layers.") - elif mean_cos_to_anchor > 0.5: - print(" MEDIUM: substantial shared operator, but layer-specific drift.") - elif mean_cos_to_anchor > 0.1: - print(" WEAK: some alignment; far from single-operator interpretation.") - else: - print(" REJECTED: operators are effectively orthogonal across layers.") - - -if __name__ == "__main__": - main() diff --git a/sa-schedule-analyze.py b/sa-schedule-analyze.py deleted file mode 100644 index 65284f3..0000000 --- a/sa-schedule-analyze.py +++ /dev/null @@ -1,108 +0,0 @@ -"""Analyze the SA schedule readout JSON: per-head variance, 
static/dynamic -correlation, and a plot.""" -import argparse -import json -import numpy as np -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt - - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("input_json") - ap.add_argument("--out-plot", default="/tmp/sa-schedule.png") - args = ap.parse_args() - - with open(args.input_json) as f: - data = json.load(f) - - num_layers = data["num_layers"] - num_heads = data["num_heads"] - Ls = np.arange(num_layers) - - ent = np.array([row["mean_attention_entropy_per_head"] for row in data["dynamic"]]) # (L, H) - logit_std = np.array([row["mean_logit_std_per_head"] for row in data["dynamic"]]) # (L, H) - metric_op = np.array([row["metric_op_per_head"] for row in data["static"]]) # (L, H) - metric_fro = np.array([row["metric_fro_per_head"] for row in data["static"]]) - - mean_ent = ent.mean(axis=1) - std_ent = ent.std(axis=1) - mean_logit = logit_std.mean(axis=1) - std_logit = logit_std.std(axis=1) - mean_metric = metric_op.mean(axis=1) - std_metric = metric_op.std(axis=1) - - # Per-head variance summary - print("\nPer-head variance across heads (coefficient of variation = std/mean):") - print(f" entropy: mean CV = {(std_ent / np.maximum(mean_ent, 1e-6)).mean():.3f}") - print(f" logit_std: mean CV = {(std_logit / np.maximum(mean_logit, 1e-6)).mean():.3f}") - print(f" metric_op: mean CV = {(std_metric / np.maximum(mean_metric, 1e-6)).mean():.3f}") - - # Correlations across layers - corr_ent_metric = np.corrcoef(mean_ent, mean_metric)[0, 1] - corr_logit_metric = np.corrcoef(mean_logit, mean_metric)[0, 1] - corr_ent_logit = np.corrcoef(mean_ent, mean_logit)[0, 1] - print("\nAcross-layer Pearson correlations (averaged over heads):") - print(f" entropy vs metric_op: {corr_ent_metric:+.3f}") - print(f" logit_std vs metric_op: {corr_logit_metric:+.3f}") - print(f" entropy vs logit_std: {corr_ent_logit:+.3f}") - - # Per-head correlation (one value per head): does each head's entropy - # across 
layers track its own metric_op across layers? - head_corrs = [] - for h in range(num_heads): - c = np.corrcoef(ent[:, h], metric_op[:, h])[0, 1] - if np.isfinite(c): - head_corrs.append(c) - print(f" per-head entropy vs metric_op: mean {np.mean(head_corrs):+.3f} " - f"std {np.std(head_corrs):.3f} min {min(head_corrs):+.3f} max {max(head_corrs):+.3f}") - - # Plot - fig, axes = plt.subplots(3, 1, figsize=(10, 9), sharex=True) - - ax = axes[0] - ax.fill_between(Ls, mean_ent - std_ent, mean_ent + std_ent, alpha=0.2, color="tab:blue", - label="±1 std across heads") - ax.plot(Ls, mean_ent, color="tab:blue", marker="o", label="mean entropy") - ax.set_ylabel("attention entropy (nats)") - ax.set_title(f"{data['model']} — SA schedule readout ({num_layers} layers, {num_heads} heads)") - ax.legend(loc="upper right") - ax.grid(alpha=0.3) - - ax = axes[1] - ax.fill_between(Ls, mean_logit - std_logit, mean_logit + std_logit, alpha=0.2, color="tab:orange", - label="±1 std across heads") - ax.plot(Ls, mean_logit, color="tab:orange", marker="o", label="mean logit std") - ax.set_ylabel("pre-softmax logit std\n(= implicit sharpness)") - ax.legend(loc="upper right") - ax.grid(alpha=0.3) - - ax = axes[2] - ax.fill_between(Ls, mean_metric - std_metric, mean_metric + std_metric, alpha=0.2, color="tab:green", - label="±1 std across heads") - ax.plot(Ls, mean_metric, color="tab:green", marker="o", label="mean metric op-norm (static)") - ax.set_ylabel("||W_K^T W_Q|| operator norm\n(static, parameter-only)") - ax.set_xlabel("layer index L") - ax.legend(loc="upper right") - ax.grid(alpha=0.3) - - plt.tight_layout() - plt.savefig(args.out_plot, dpi=100, bbox_inches="tight") - print(f"\nWrote plot to {args.out_plot}") - - # Also save a small heatmap of per-head entropy for visual spread - plt.figure(figsize=(10, 6)) - plt.imshow(ent.T, aspect="auto", cmap="viridis", origin="lower") - plt.colorbar(label="attention entropy") - plt.xlabel("layer L") - plt.ylabel("head h") - 
plt.title(f"{data['model']} — per-head entropy heatmap") - heatmap_path = args.out_plot.replace(".png", "-heatmap.png") - plt.tight_layout() - plt.savefig(heatmap_path, dpi=100, bbox_inches="tight") - print(f"Wrote heatmap to {heatmap_path}") - - -if __name__ == "__main__": - main() diff --git a/sa-schedule-delta-svd.py b/sa-schedule-delta-svd.py deleted file mode 100644 index 3d161c9..0000000 --- a/sa-schedule-delta-svd.py +++ /dev/null @@ -1,234 +0,0 @@ -"""Per-layer residual-stream delta SVD: δ_L = h_{L+1} - h_L stacked -over all tokens in a calibration set. SVD gives us: - - - top singular value per layer → γ_L (scalar magnitude, what Kirkpatrick fit) - - top right-singular-vector per layer → v_L (direction in hidden space) - - effective rank per layer → is this one direction or many? - - pairwise v_L cos-sim across layers → are layers subspace-disjoint or -shared? - -This directly tests the anisotropic-SA hypothesis: - h_{L+1} = h_L + T_shared(h_L) + γ_L · v_L · f(...) - -Phase C prediction: v_L vectors cover broad shared subspace (high mutual cos-sim, -rank-few overall), δ_L is mostly noise around a shared update. -Phase E prediction: v_L vectors are specialized (low pairwise cos-sim, each layer -its own direction), effective rank of the block is close to N. - -Qwen3-32B phases: A 0-6, B 7-9, C 10-31, D 32-46, E 47-58, tail 59-63. -""" -import argparse -import json -import numpy as np -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer - - -CALIB = [ - "The Eiffel Tower is located in", - "Photosynthesis is the process by which", - "The three branches of the US government are the legislative, executive, and", - "If a train travels 60 miles per hour for 2.5 hours, the total distance covered is", - "Solve for x: 3x + 7 = 22. 
The answer is x =", - "The derivative of x^3 + 2x^2 is", - "def fibonacci(n):\n if n < 2:\n return n\n return", - "# Python list comprehension to square even numbers in 0-9\nresult = ", - "SELECT name, age FROM users WHERE", - "She opened the old wooden box and found", - "The argument in favor of renewable energy is", - "User: What is the capital of Australia?\nAssistant:", - "Write a haiku about autumn:\n", - "Albert Einstein was born in the year", - "The speed of light in vacuum is approximately", - "I really loved that movie because", - "The main difference between a virus and a bacterium is", - "The French word for 'apple' is", - "1 + 1 = ", - "Once upon a time, in a land far away,", - "The key insight of general relativity is that gravity is not a force but", - "Water boils at 100 degrees Celsius at standard atmospheric pressure. At higher", - "In object-oriented programming, encapsulation refers to", - "The mitochondria is often called the powerhouse of the cell because it", - "Shakespeare's Hamlet begins with the famous line", -] - - -def phase_of(L, num_layers): - if num_layers == 64: - if L <= 6: return "A" - if L <= 9: return "B" - if L <= 31: return "C" - if L <= 46: return "D" - if L <= 58: return "E" - return "tail" - frac = L / num_layers - if frac < 0.11: return "A" - if frac < 0.15: return "B" - if frac < 0.5: return "C" - if frac < 0.75: return "D" - if frac < 0.92: return "E" - return "tail" - - -@torch.no_grad() -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("--model", default="Qwen/Qwen3-32B") - ap.add_argument("--out", default="/tmp/delta-svd.json") - ap.add_argument("--top-k", type=int, default=8, - help="keep top-k singular values / directions per layer") - args = ap.parse_args() - - print(f"Loading {args.model} ...", flush=True) - tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained( - args.model, torch_dtype=torch.bfloat16, device_map="cuda", - 
trust_remote_code=True, attn_implementation="eager", - ).eval() - num_layers = model.config.num_hidden_layers - hidden = model.config.hidden_size - print(f" L={num_layers}, hidden={hidden}", flush=True) - - # Concat calib and tokenize as one stream - text = "\n\n".join(CALIB) - enc = tok(text, return_tensors="pt", truncation=True, max_length=2048).to("cuda") - n_tok = enc.input_ids.shape[1] - print(f" calibration tokens: {n_tok}", flush=True) - - out = model(**enc, output_hidden_states=True, use_cache=False) - # hidden_states: tuple of (num_layers+1) tensors, each (1, n_tok, hidden) - hs = [h[0].float().cpu().numpy() for h in out.hidden_states] - # hs[L] = residual stream entering layer L (or leaving layer L-1). So - # δ_L = hs[L+1] - hs[L] is layer L's contribution. - print(f" hidden_states count: {len(hs)} (expect {num_layers+1})", flush=True) - del model, out - torch.cuda.empty_cache() - - # Per-layer SVD - per_layer = [] - for L in range(num_layers): - delta = hs[L+1] - hs[L] # (n_tok, hidden) - h_in = hs[L] # (n_tok, hidden) - # Remove BOS / first-token artifacts (often outlier) - delta = delta[1:] - h_in = h_in[1:] - n, d = delta.shape - - # Norm per token - token_norms = np.linalg.norm(delta, axis=1) # (n,) - h_norms = np.linalg.norm(h_in, axis=1) # (n,) - # Relative step size: ||δ_L|| / ||h_L|| - rel_step = (token_norms / np.maximum(h_norms, 1e-8)) - # Angle between δ and h, per token: cos = <δ, h> / (||δ||||h||) - dot = np.einsum("nd,nd->n", delta, h_in) - cos_delta_h = dot / np.maximum(token_norms * h_norms, 1e-8) - # "Parallel" component: how much of δ points along ±h - parallel_frac = np.abs(cos_delta_h).mean() - - # SVD in economy mode (on CPU; 2047x5120 fits easy) - U, S, Vt = np.linalg.svd(delta, full_matrices=False) - # S: singular values, decreasing. Vt: right singular vectors (directions). 
- - # Effective rank (entropy of normalized squared SVs) - p = S**2 / (S**2).sum() - p_nz = p[p > 1e-12] - eff_rank = float(np.exp(-(p_nz * np.log(p_nz)).sum())) - - # Energy concentration - top1_frac = float(p[0]) - top3_frac = float(p[:3].sum()) - top10_frac = float(p[:min(10, len(p))].sum()) - - per_layer.append({ - "L": L, - "phase": phase_of(L, num_layers), - "frob": float(np.linalg.norm(delta)), - "token_norm_mean": float(token_norms.mean()), - "token_norm_std": float(token_norms.std()), - "h_norm_mean": float(h_norms.mean()), - "rel_step_mean": float(rel_step.mean()), - "rel_step_std": float(rel_step.std()), - "parallel_frac": float(parallel_frac), - "cos_delta_h_mean": float(cos_delta_h.mean()), - "top_singvals": S[:args.top_k].tolist(), - "top_dirs": Vt[:args.top_k].astype(np.float32).tolist(), - "eff_rank": eff_rank, - "top1_frac": top1_frac, - "top3_frac": top3_frac, - "top10_frac": top10_frac, - }) - print(f" L={L:>2} phase={phase_of(L, num_layers):>4} " - f"||h||={h_norms.mean():>7.1f} " - f"||δ||={token_norms.mean():>7.2f} " - f"rel={rel_step.mean():.4f} " - f"‖parallel‖={parallel_frac:.4f} " - f"eff_rank={eff_rank:>6.2f}", - flush=True) - - # Pairwise cos-sim of top-1 directions across layers - top1_dirs = np.array([pl["top_dirs"][0] for pl in per_layer]) # (L, d) - top1_cos = top1_dirs @ top1_dirs.T # (L, L) - - # Subspace principal angles: project each layer's top-k into others' span - print(f"\n=== Pairwise top-1 cos-sim (adjacent) ===") - for L in range(num_layers - 1): - print(f" L={L:>2}→{L+1:>2} phase={phase_of(L, num_layers):>4} " - f"|cos|={abs(top1_cos[L, L+1]):>.4f}") - - # Per-phase summary: mean |cos| within phase vs cross-phase - phase_members = {} - for L in range(num_layers): - phase_members.setdefault(phase_of(L, num_layers), []).append(L) - - print(f"\n=== Per-phase top-1 direction overlap ===") - print(f" {'phase':>6} {'N':>3} {'intra_cos_mean':>14} {'cross_cos_mean':>14}") - for ph, Ls in phase_members.items(): - intra = 
abs(top1_cos[np.ix_(Ls, Ls)]) - if len(Ls) >= 2: - intra_vals = intra[np.triu_indices(len(Ls), k=1)] - intra_mean = float(intra_vals.mean()) - else: - intra_mean = 1.0 - other_Ls = [L for L in range(num_layers) if L not in Ls] - if other_Ls: - cross = abs(top1_cos[np.ix_(Ls, other_Ls)]) - cross_mean = float(cross.mean()) - else: - cross_mean = 0.0 - print(f" {ph:>6} {len(Ls):>3} {intra_mean:>14.4f} {cross_mean:>14.4f}") - - # Subspace overlap: for each phase, find the block's overall principal subspace - # and measure how much of each individual layer sits in it. - print(f"\n=== Block-shared subspace (rank-8) capture fraction per layer ===") - for ph, Ls in phase_members.items(): - if len(Ls) < 2: - continue - # Stack top-k directions from all layers in phase - block_dirs = np.concatenate([per_layer[L]["top_dirs"] for L in Ls], axis=0) - # SVD to get the shared basis of the union - U_b, S_b, Vt_b = np.linalg.svd(block_dirs, full_matrices=False) - shared_basis = Vt_b[:8] # top-8 shared directions of the block's top-k union - # Project each layer's top-1 direction and measure capture - for L in Ls: - v1 = np.array(per_layer[L]["top_dirs"][0]) - capture = float((shared_basis @ v1).__pow__(2).sum()) - print(f" phase={ph:>4} L={L:>2} v1 captured by block top-8: {capture:.4f}") - - # Save - save = { - "model": args.model, - "num_layers": num_layers, - "hidden": hidden, - "n_calib_tokens": int(n_tok), - "per_layer": [ - {k: v for k, v in pl.items() if k != "top_dirs"} # directions too big - for pl in per_layer - ], - "top1_cos_adjacent": [float(top1_cos[L, L+1]) for L in range(num_layers-1)], - } - with open(args.out, "w") as f: - json.dump(save, f, indent=2) - print(f"\nSaved: {args.out}") - - -if __name__ == "__main__": - main() diff --git a/sa-schedule-derive-from-last.py b/sa-schedule-derive-from-last.py deleted file mode 100644 index c1571fa..0000000 --- a/sa-schedule-derive-from-last.py +++ /dev/null @@ -1,214 +0,0 @@ -"""Under the SA-schedule hypothesis, earlier 
layers should be approximately -a temperature-rescaled version of a shared operator. The simplest test: -pick the last layer's per-head metric spectrum as anchor, and ask whether -earlier layers' spectra are scalar rescales of it. - -Three experiments on the existing per-head singular values: - - (1) Spectral shape invariance. For each head h, normalize σ_L^h by σ_max - and compare the shape vector across layers. If shapes match, scale is - the only free parameter. - - (2) Scalar rescale fit. For each (L, h), find T_L^h minimizing - ||σ_L^h - T_L^h σ_last^h||². Optimal T_L^h = <σ_L^h, σ_last^h>/||σ_last^h||². - Report residual = ||σ_L^h - T_L^h σ_last^h|| / ||σ_L^h||. - - (3) Cross-head sharing. If the *shape* is the same across heads too (not - just across layers), we could use a single anchor per *layer* (last - layer, one head) and reconstruct everything. Report mean shape - correlation across heads within a layer. - -The anchor doesn't have to be the last layer — we also try: last layer, -middle layer, per-layer-group best match. Purpose is not to pick the best -anchor but to understand which choice lets reconstruction succeed. 
-""" -import argparse -import json -import numpy as np - - -def pad_to(arr, n): - """Pad a 1D array to length n with zeros (for heads of different rank).""" - if arr.shape[0] == n: - return arr - out = np.zeros(n, dtype=arr.dtype) - out[:arr.shape[0]] = arr - return out - - -def collect_spectra(data): - """Return array sigma[L, h, k] of singular values, padded.""" - num_layers = data["num_layers"] - num_heads = data["num_heads"] - # Determine max rank across all heads - max_k = 0 - for row in data["static"]: - for s in row["metric_singvals_per_head"]: - max_k = max(max_k, len(s)) - sigma = np.zeros((num_layers, num_heads, max_k), dtype=np.float64) - for L, row in enumerate(data["static"]): - for h, s in enumerate(row["metric_singvals_per_head"]): - sigma[L, h, :len(s)] = s - return sigma - - -def scalar_rescale_fit(x, y): - """Optimal scalar T s.t. ||x - T y|| is minimized. - Returns (T, residual_frac) where residual_frac = ||x - T y|| / ||x||. - """ - denom = float((y * y).sum()) - if denom < 1e-20: - return 0.0, 1.0 - T = float((x * y).sum() / denom) - resid = x - T * y - rn = float(np.linalg.norm(resid)) - xn = float(np.linalg.norm(x)) - return T, (rn / xn if xn > 1e-20 else 0.0) - - -def cos_sim(x, y): - xn = float(np.linalg.norm(x)) - yn = float(np.linalg.norm(y)) - if xn < 1e-20 or yn < 1e-20: - return 0.0 - return float((x * y).sum() / (xn * yn)) - - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("input_json") - ap.add_argument("--anchor", choices=["last", "middle", "best"], default="last") - args = ap.parse_args() - - with open(args.input_json) as f: - data = json.load(f) - - num_layers = data["num_layers"] - num_heads = data["num_heads"] - sigma = collect_spectra(data) # (L, H, K) - print(f"Loaded sigma: shape {sigma.shape}, max rank {sigma.shape[-1]}") - - # ------------------------------------------------------------------ - # Experiment 1: spectral shape invariance across layers (per head) - # 
------------------------------------------------------------------ - print("\n=== (1) Spectral shape invariance across layers ===") - # For each head, compute normalized shape σ / σ_max per layer; measure - # mean pairwise cosine similarity of shapes across layers. - shape = np.zeros_like(sigma) - for L in range(num_layers): - for h in range(num_heads): - s = sigma[L, h] - mx = s.max() - shape[L, h] = s / mx if mx > 1e-20 else s - - per_head_cos = np.zeros(num_heads) - for h in range(num_heads): - cs = [] - for L1 in range(num_layers): - for L2 in range(L1 + 1, num_layers): - cs.append(cos_sim(shape[L1, h], shape[L2, h])) - per_head_cos[h] = np.mean(cs) - print(f" per-head mean pairwise cosine of shape across layers:") - print(f" mean {per_head_cos.mean():.4f} std {per_head_cos.std():.4f} " - f"min {per_head_cos.min():.4f} max {per_head_cos.max():.4f}") - # If mean > ~0.99 → shapes identical, pure scalar rescale works - # If mean ~ 0.85-0.95 → close but structure changes layer-to-layer - # If mean < 0.8 → shape varies meaningfully, scalar rescale insufficient - - # ------------------------------------------------------------------ - # Experiment 2: scalar rescale fit to an anchor layer - # ------------------------------------------------------------------ - if args.anchor == "last": - anchor_L = num_layers - 1 - elif args.anchor == "middle": - anchor_L = num_layers // 2 - else: # best: pick layer whose shape is most typical (highest mean cos - # to all other layers) - best_score = -1.0 - anchor_L = num_layers - 1 - for Lc in range(num_layers): - score = 0.0 - for h in range(num_heads): - for L in range(num_layers): - if L == Lc: - continue - score += cos_sim(shape[Lc, h], shape[L, h]) - if score > best_score: - best_score = score - anchor_L = Lc - print(f" [auto-anchor] best layer by total shape-cosine: L={anchor_L}") - - print(f"\n=== (2) Scalar rescale fit to anchor L={anchor_L} ===") - T_map = np.zeros((num_layers, num_heads)) - resid_map = np.zeros((num_layers, 
num_heads)) - for L in range(num_layers): - for h in range(num_heads): - T, r = scalar_rescale_fit(sigma[L, h], sigma[anchor_L, h]) - T_map[L, h] = T - resid_map[L, h] = r - - # Per-layer residual stats - print(f" per-layer residual fraction ||σ_L^h - T σ_anchor^h|| / ||σ_L^h||:") - print(f" {'L':>3} {'mean resid':>10} {'max resid':>10} {'mean T':>8}") - for L in range(num_layers): - rl = resid_map[L] - tl = T_map[L] - print(f" {L:>3} {rl.mean():>10.4f} {rl.max():>10.4f} {tl.mean():>8.3f}") - - print(f"\n overall mean residual: {resid_map.mean():.4f}") - print(f" overall max residual: {resid_map.max():.4f}") - print(f" frac of (L,h) with resid < 0.10: " - f"{(resid_map < 0.10).mean():.3f}") - print(f" frac of (L,h) with resid < 0.20: " - f"{(resid_map < 0.20).mean():.3f}") - - # ------------------------------------------------------------------ - # Experiment 2b: does T match per-head dynamic entropy? - # ------------------------------------------------------------------ - ent = np.array([row["mean_attention_entropy_per_head"] - for row in data["dynamic"]]) # (L, H) - # T is a scalar temperature of the metric. Geometrically, higher T means - # sharper attention (smaller entropy). So corr(T, entropy) should be negative - # if the scalar rescale captures the temperature schedule. - from numpy import corrcoef - c = float(corrcoef(T_map.flatten(), ent.flatten())[0, 1]) - print(f"\n correlation corr(T_L^h, entropy_L^h) = {c:+.3f} " - f"(negative expected: larger T → sharper → lower entropy)") - - # Also try: does T predict entropy *better* than raw op_norm? (Already had - # op_norm r=+0.45 in geometry analysis.) 
- op_norm = sigma.max(axis=-1) # (L, H) - c_op = float(corrcoef(op_norm.flatten(), ent.flatten())[0, 1]) - print(f" for comparison, corr(op_norm, entropy) = {c_op:+.3f}") - - # ------------------------------------------------------------------ - # Experiment 3: shape similarity across heads within a layer - # ------------------------------------------------------------------ - print(f"\n=== (3) Cross-head shape similarity within each layer ===") - print(f" {'L':>3} {'mean pair-cos':>14}") - for L in range(num_layers): - cs = [] - for h1 in range(num_heads): - for h2 in range(h1 + 1, num_heads): - cs.append(cos_sim(shape[L, h1], shape[L, h2])) - print(f" {L:>3} {np.mean(cs):>14.4f}") - - # ------------------------------------------------------------------ - # Summary - # ------------------------------------------------------------------ - print("\n=== Summary ===") - print(f" anchor layer: {anchor_L}") - print(f" spectral shape is {'very stable' if per_head_cos.mean() > 0.98 else 'approximately stable' if per_head_cos.mean() > 0.9 else 'not stable'} " - f"across layers (per-head mean pairwise cos = {per_head_cos.mean():.3f})") - print(f" scalar-rescale fit residual: mean {resid_map.mean():.3f}") - if resid_map.mean() < 0.1: - verdict = "HYPOTHESIS SUPPORTED — scalar temperature rescale of a shared operator reconstructs earlier layers to within 10% Frobenius residual." - elif resid_map.mean() < 0.3: - verdict = "PARTIALLY SUPPORTED — scalar rescale captures most of the structure; a low-rank correction on top is likely enough." - else: - verdict = "HYPOTHESIS REJECTED for pure scalar rescale — spectra differ substantially in shape; need full layer-by-layer operators or rank-k delta." 
- print(f"\n {verdict}") - - -if __name__ == "__main__": - main() diff --git a/sa-schedule-fit-gamma.py b/sa-schedule-fit-gamma.py deleted file mode 100644 index 01f8201..0000000 --- a/sa-schedule-fit-gamma.py +++ /dev/null @@ -1,145 +0,0 @@ -"""Fit a functional form to the LN γ trajectory across layers; derive the -effective attention temperature T(L) from known coupling formulas. - -Rules of what scales with depth (from literature): - DeepNorm: α_dec = (2M)^(1/4), β_dec = (8M)^(-1/4). Same per layer — does - NOT depend on layer index l. The free variation across layers has to - live in LN γ. - Depth-μP: block multiplier a/√L, LR η/√L. Same per layer. - So γ(L) is the family carrying the per-layer schedule. - -Try fitting forms: - γ(L) = a · L^b (power law in layer index) - γ(L) = a · exp(b·L) (exponential) - γ(L) = a + b·L (linear) - γ(L) = a + b·L^c (free c) (power law with free exponent) - -Report fit quality (R², residual statistics), and for the best fit, compute -the derived T(L) curve. 
-""" -import json -import numpy as np -from math import log, exp - - -def fit_power(L, y): - """y ≈ a · L^b → log y ≈ log a + b log L.""" - mask = (L > 0) & (y > 0) - lx, ly = np.log(L[mask]), np.log(y[mask]) - b, loga = np.polyfit(lx, ly, 1) - yhat = np.exp(loga) * (L**b) - r2 = 1 - ((y - yhat)**2).sum() / ((y - y.mean())**2).sum() - return {"form": "a*L^b", "a": float(np.exp(loga)), "b": float(b), "r2": float(r2), "yhat": yhat} - - -def fit_exponential(L, y): - """y ≈ a · exp(b·L) → log y ≈ log a + b·L.""" - mask = y > 0 - b, loga = np.polyfit(L[mask], np.log(y[mask]), 1) - yhat = np.exp(loga) * np.exp(b * L) - r2 = 1 - ((y - yhat)**2).sum() / ((y - y.mean())**2).sum() - return {"form": "a*exp(b*L)", "a": float(np.exp(loga)), "b": float(b), "r2": float(r2), "yhat": yhat} - - -def fit_linear(L, y): - b, a = np.polyfit(L, y, 1) - yhat = a + b * L - r2 = 1 - ((y - yhat)**2).sum() / ((y - y.mean())**2).sum() - return {"form": "a+b*L", "a": float(a), "b": float(b), "r2": float(r2), "yhat": yhat} - - -def fit_piecewise_two(L, y): - """Best split point L* and linear fits on each half (log-space).""" - best = None - for Ls in range(3, len(L) - 3): - mA, mB = L < Ls, L >= Ls - if (y[mA] <= 0).any() or (y[mB] <= 0).any(): - continue - bA, aA = np.polyfit(L[mA], np.log(y[mA]), 1) - bB, aB = np.polyfit(L[mB], np.log(y[mB]), 1) - yhat = np.where(mA, np.exp(aA + bA * L), np.exp(aB + bB * L)) - r2 = 1 - ((y - yhat)**2).sum() / ((y - y.mean())**2).sum() - if best is None or r2 > best["r2"]: - best = {"form": f"piecewise-exp-split@L={Ls}", "split": int(Ls), - "a1": float(np.exp(aA)), "b1": float(bA), - "a2": float(np.exp(aB)), "b2": float(bB), - "r2": float(r2), "yhat": yhat} - return best - - -def main(): - d = json.load(open("/tmp/qwen3-4b-null.json")) - scales = d["scales"] - num_layers = len(scales["input_ln"]) - L = np.arange(num_layers, dtype=float) - - families_of_interest = ["input_ln", "post_attn_ln", "q_norm", "k_norm", - "q_proj", "k_proj", "v_proj", "o_proj", - 
"gate_proj", "up_proj", "down_proj"] - - print("=" * 72) - print("γ-trajectory fits per family (Qwen3-4B, 36 layers)") - print("=" * 72) - - for fam in families_of_interest: - y = np.array(scales[fam], dtype=float) - print(f"\n--- {fam} ---") - print(f" L=0: {y[0]:.3f} L=35: {y[-1]:.3f} ratio: {y[-1]/y[0]:+.2f}×") - fits = [ - fit_linear(L, y), - fit_power(L + 1, y), # L+1 so L=0 doesn't explode log - fit_exponential(L, y), - fit_piecewise_two(L + 1, y), - ] - for f in fits: - if f is None: - continue - extras = "" - if "b" in f: - extras = f" (a={f['a']:.3g}, b={f['b']:+.4f})" - elif "split" in f: - extras = f" (split={f['split']}, b1={f['b1']:+.4f}, b2={f['b2']:+.4f})" - print(f" {f['form']:<32} R²={f['r2']:+.4f}{extras}") - - # For input_ln specifically: plot the curve (text) and derive T(L) - y = np.array(scales["input_ln"], dtype=float) - print("\n" + "=" * 72) - print("input_ln γ magnitude across layers (the schedule signal)") - print("=" * 72) - print(f" {'L':>3} {'γ_L':>12} {'γ_L / γ_0':>10} {'log γ_L':>10}") - for l_idx in range(num_layers): - print(f" {l_idx:>3} {y[l_idx]:>12.3f} {y[l_idx]/y[0]:>10.3f} {log(y[l_idx]):>+10.4f}") - - # Classical SA schedules for comparison - # - Linear: T(k) = T0 - k * (T0 - Tf)/N - # - Exponential / Kirkpatrick: T(k) = T0 * α^k - # - Logarithmic / Hajek: T(k) = c / log(k+2) - # For γ (which grows = temperature drops, since larger γ → sharper attention): - # γ growing corresponds to T cooling - print("\n" + "=" * 72) - print("Derived attention-temperature T(L) interpretation") - print("=" * 72) - print(" Attention logit ∝ (γ * W_Q * W_K * ||residual||²) / √d_head.") - print(" With γ_L the schedule dial and other factors ~constant across layers,") - print(" effective attention temperature T(L) ∝ 1/γ(L).") - print(f"\n T(L)/T(0) = γ(0)/γ(L):") - print(f" {'L':>3} {'T(L)/T(0)':>10} (smaller = cooler = sharper attention)") - for l_idx in range(num_layers): - print(f" {l_idx:>3} {y[0]/y[l_idx]:>10.4f}") - - # Comparison with 
classical SA cooling laws: - # Kirkpatrick: T(L) = T0 · α^L → log T(L) = log T0 + L log α - logT = -np.log(y / y[0]) # because T ∝ 1/γ - b_kirk, a_kirk = np.polyfit(L, logT, 1) - # Hajek (log-cooling): T(L) = c/log(L+2) - # Predicts: log T = log c - log(log(L+2)) - # Fit T(L) to c / log(L+c2) - print(f"\n Kirkpatrick-law fit (exponential cooling):") - print(f" log T(L) = {a_kirk:+.3f} + {b_kirk:+.4f} * L → T(L) = exp({a_kirk:+.3f}) · exp({b_kirk:+.4f}·L)") - logT_hat = a_kirk + b_kirk * L - r2_kirk = 1 - ((logT - logT_hat)**2).sum() / ((logT - logT.mean())**2).sum() - print(f" R² (in log space) = {r2_kirk:+.4f} — ideally ≈ 1 if cooling is pure exponential") - - -if __name__ == "__main__": - main() diff --git a/sa-schedule-gamma-directions.py b/sa-schedule-gamma-directions.py deleted file mode 100644 index 8bb6310..0000000 --- a/sa-schedule-gamma-directions.py +++ /dev/null @@ -1,122 +0,0 @@ -"""Pull input_layernorm.γ vectors from a model and analyze direction -structure across layers. - -Question: is γ just scalar magnitude (isotropic SA) or does each layer -have a preferred direction (anisotropic SA / geometry-aware)? - -Decomposition: γ_L = ||γ_L|| · γ_L̂ - - ||γ_L|| is what our scalar Kirkpatrick fit captured - - γ_L̂ is unit direction — if layers share direction, γ is rank-1 + - scaling (classical isotropic). If directions differ per layer, γ - encodes per-layer preferred axis (anisotropic). - -We also look at: - - pairwise cos-sim between γ_L̂ across layers - - principal components of [γ_L̂]_L (stacked matrix) - - per-phase structure: is Phase E more anisotropic than Phase C? 
-""" -import argparse -import numpy as np -import torch -from transformers import AutoModelForCausalLM - - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("--model", default="Qwen/Qwen3-32B") - ap.add_argument("--out", default="/tmp/gamma-dirs.json") - args = ap.parse_args() - - print(f"Loading {args.model} (CPU, layernorm params only)...", flush=True) - m = AutoModelForCausalLM.from_pretrained( - args.model, torch_dtype=torch.float32, device_map="cpu", - trust_remote_code=True, - ) - num_layers = m.config.num_hidden_layers - hidden = m.config.hidden_size - print(f" L={num_layers}, hidden={hidden}", flush=True) - - gammas = np.stack([ - m.model.layers[L].input_layernorm.weight.detach().float().cpu().numpy() - for L in range(num_layers) - ]) # (L, hidden) - del m - - norms = np.linalg.norm(gammas, axis=1) - units = gammas / norms[:, None] - - # Pairwise cos-sim of unit γ - cos_mat = units @ units.T # (L, L) - - # PCA on unit vectors - centered = units - units.mean(axis=0, keepdims=True) - _, S, Vt = np.linalg.svd(centered, full_matrices=False) - explained = S**2 / (S**2).sum() - - # How much of each γ_L unit is explained by top-1 direction (shared)? 
- top1 = Vt[0] # (hidden,) - proj_top1 = units @ top1 # (L,) - residual_after_top1 = np.sqrt(np.maximum(1 - proj_top1**2, 0)) - - # Per-phase summary (Qwen3-32B boundaries) - def phase(L): - if L <= 6: return "A" - if L <= 9: return "B" - if L <= 31: return "C" - if L <= 46: return "D" - if L <= 58: return "E" - return "tail" - - phase_ls = {} - for L in range(num_layers): - phase_ls.setdefault(phase(L), []).append(L) - - print(f"\n=== ||γ_L|| per layer (scalar magnitude) ===") - for L in range(num_layers): - print(f" L={L:>2} phase={phase(L):>5} ||γ||={norms[L]:>8.3f} " - f"proj_top1={proj_top1[L]:>+.4f} resid={residual_after_top1[L]:>.4f}") - - print(f"\n=== PCA of unit γ vectors (direction structure) ===") - print(f" Explained variance, top 10 components:") - for i in range(min(10, len(S))): - print(f" PC{i}: {explained[i]:.4f} (singular_val={S[i]:.4f})") - print(f" Top-3 explain: {explained[:3].sum():.4f}") - print(f" Top-10 explain: {explained[:10].sum():.4f}") - - print(f"\n=== Per-phase direction statistics ===") - print(f" {'phase':>6} {'N':>3} {'||γ||_mean':>10} {'||γ||_std':>9} " - f"{'intra_cos':>9} {'vs_other_cos':>12}") - for ph, Ls in phase_ls.items(): - u = units[Ls] - intra = (u @ u.T)[np.triu_indices(len(Ls), k=1)] - intra_mean = intra.mean() if len(intra) > 0 else 1.0 - # Vs other phases - other_Ls = [L for L in range(num_layers) if L not in Ls] - if other_Ls: - u_other = units[other_Ls] - vs = u @ u_other.T - vs_mean = vs.mean() - else: - vs_mean = 0.0 - print(f" {ph:>6} {len(Ls):>3} {norms[Ls].mean():>10.3f} " - f"{norms[Ls].std():>9.3f} {intra_mean:>+9.4f} {vs_mean:>+12.4f}") - - print(f"\n=== Adjacent-pair unit-γ cos-sim ===") - for L in range(num_layers - 1): - print(f" L={L:>2}→{L+1:>2} phase={phase(L):>5} cos={cos_mat[L, L+1]:>+.4f}") - - import json - with open(args.out, "w") as f: - json.dump({ - "model": args.model, - "num_layers": num_layers, - "norms": norms.tolist(), - "proj_top1": proj_top1.tolist(), - "explained_var": 
explained.tolist(), - "cos_adjacent": [float(cos_mat[L, L+1]) for L in range(num_layers - 1)], - }, f, indent=2) - print(f"\nSaved: {args.out}") - - -if __name__ == "__main__": - main() diff --git a/sa-schedule-geometry-analyze.py b/sa-schedule-geometry-analyze.py deleted file mode 100644 index fb8a18e..0000000 --- a/sa-schedule-geometry-analyze.py +++ /dev/null @@ -1,114 +0,0 @@ -"""What does per-head T (entropy) correlate with geometrically? - -For each (layer, head) we already have singular values of the metric M^h = W_K^h^T W_Q^h -(up to the low-rank structure — strictly SVD of the head_dim x head_dim product). Derive -richer per-head geometric descriptors and test which ones predict dynamic entropy. - -Descriptors per head: - op_norm σ_max — global "capacity for sharpness" - fro_norm √Σ σ_i² — total metric "energy" - rank_eff Σσ / σ_max — effective number of modes - spec_entropy -Σ (σ_i² / Σσ_j²) log(...) — flatness of spectrum (nats) - anisotropy σ_max / σ_mean — how "peaked" the top mode is - condition σ_max / σ_min — ratio of biggest to smallest - trace Σσ_i — sum of modes (L1-like) - -Correlate each of these per-head descriptors against per-head dynamic entropy, across -all (layer, head) pairs. Also stratified by layer-position (early/mid/late). -""" -import argparse -import json -import numpy as np - - -def compute_per_head_geometry(singvals_list): - """singvals_list: list per head of list of singular values. 
Returns dict of arrays.""" - s_all = [np.array(s, dtype=np.float64) for s in singvals_list] - op = np.array([s.max() for s in s_all]) - fro = np.array([np.sqrt((s ** 2).sum()) for s in s_all]) - trace = np.array([s.sum() for s in s_all]) - rank_eff = np.array([s.sum() / max(s.max(), 1e-12) for s in s_all]) - # Spectral entropy: use normalized σ² as probabilities - spec_ent = np.zeros(len(s_all)) - for i, s in enumerate(s_all): - p = (s ** 2) / max((s ** 2).sum(), 1e-12) - p = np.clip(p, 1e-12, 1.0) - spec_ent[i] = float(-(p * np.log(p)).sum()) - anis = np.array([s.max() / max(s.mean(), 1e-12) for s in s_all]) - cond = np.array([s.max() / max(s.min(), 1e-12) for s in s_all]) - return dict(op=op, fro=fro, trace=trace, rank_eff=rank_eff, - spec_ent=spec_ent, anisotropy=anis, condition=cond) - - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("input_json") - args = ap.parse_args() - - with open(args.input_json) as f: - data = json.load(f) - - num_layers = data["num_layers"] - num_heads = data["num_heads"] - - # Entropy per (layer, head) - ent = np.array([row["mean_attention_entropy_per_head"] for row in data["dynamic"]]) # (L, H) - logit_std = np.array([row["mean_logit_std_per_head"] for row in data["dynamic"]]) # (L, H) - - # Geometric descriptors per (layer, head) - geom = {k: np.zeros((num_layers, num_heads)) for k in - ["op", "fro", "trace", "rank_eff", "spec_ent", "anisotropy", "condition"]} - for L, row in enumerate(data["static"]): - per_head = compute_per_head_geometry(row["metric_singvals_per_head"]) - for k, v in per_head.items(): - geom[k][L] = v - - # Flatten across (layer, head) and correlate - print("All (layer, head) pairs — Pearson correlation with dynamic entropy:") - ent_flat = ent.flatten() - logit_flat = logit_std.flatten() - results = {} - for k, v in geom.items(): - v_flat = v.flatten() - c_ent = float(np.corrcoef(v_flat, ent_flat)[0, 1]) - c_logit = float(np.corrcoef(v_flat, logit_flat)[0, 1]) - results[k] = (c_ent, c_logit) - 
print(f" {k:12} vs entropy: {c_ent:+.3f} vs logit_std: {c_logit:+.3f}") - - # Stratify by layer position — early (0-11), mid (12-23), late (24-35) - thirds = [(0, num_layers // 3, "early"), - (num_layers // 3, 2 * num_layers // 3, "mid"), - (2 * num_layers // 3, num_layers, "late")] - print("\nStratified by layer position (entropy correlation):") - for lo, hi, name in thirds: - print(f" [{name} L{lo}-{hi-1}]", end="") - for k in ["op", "fro", "rank_eff", "spec_ent", "anisotropy", "condition"]: - c = float(np.corrcoef(geom[k][lo:hi].flatten(), ent[lo:hi].flatten())[0, 1]) - print(f" {k}:{c:+.2f}", end="") - print() - - # Best single predictor across all - print("\nBest single geometric predictor of entropy (abs):") - best = max(results.items(), key=lambda kv: abs(kv[1][0])) - print(f" {best[0]} r = {best[1][0]:+.3f}") - - # Multi-regression: try op, spec_ent, rank_eff jointly - print("\nLinear regression of entropy on multiple descriptors (standardized):") - from numpy.linalg import lstsq - X_cols = ["op", "spec_ent", "rank_eff", "anisotropy"] - X = np.stack([geom[k].flatten() for k in X_cols], axis=1) - # standardize - X = (X - X.mean(axis=0)) / (X.std(axis=0) + 1e-12) - y = (ent_flat - ent_flat.mean()) / (ent_flat.std() + 1e-12) - X1 = np.concatenate([X, np.ones((X.shape[0], 1))], axis=1) - coef, res, rk, sv = lstsq(X1, y, rcond=None) - y_pred = X1 @ coef - r2 = 1 - float(((y - y_pred) ** 2).sum() / ((y - y.mean()) ** 2).sum()) - print(f" R² = {r2:.3f}") - print(f" standardized coefficients:") - for name, c in zip(X_cols, coef[:-1]): - print(f" {name:12} {c:+.3f}") - - -if __name__ == "__main__": - main() diff --git a/sa-schedule-layer-variation.py b/sa-schedule-layer-variation.py deleted file mode 100644 index 6ee9dae..0000000 --- a/sa-schedule-layer-variation.py +++ /dev/null @@ -1,238 +0,0 @@ -"""After removing the known gauge freedoms (per-head d_h rotation tying -W_Q/W_K/W_V/W_O together, per-layer d_ff rotation tying gate/up/down), -measure per-family 
Frobenius distance between consecutive layers within a -middle block. Families with low post-alignment distance are candidates for -"shared operator" across the block; high distance → carries the schedule. - -Normalize each matrix by its Frobenius norm first (so scale differences -don't dominate). We want to see direction of drift, not magnitude. - -Gauge freedoms being removed: - - Per-head d_h rotation R ∈ O(d_h): W_Q^h, W_K^h, W_V^h → R W^h; - W_O^h → W_O^h R^T. Softmax attention is invariant under this. - - Per-layer d_ff rotation S ∈ O(d_ff): gate_proj, up_proj → S W; - down_proj → W S^T. SwiGLU/GLU is NOT fully invariant under d_ff - rotation (because the elementwise gate*up is coordinate-dependent), - so this is an approximate alignment — still better than raw. - -Families that have no gauge freedom (layernorm γ, q_norm, k_norm): compare -directly after scale normalization. -""" -import argparse -import json -import numpy as np -import torch -from transformers import AutoModelForCausalLM - - -def procrustes(M): - """Orthogonal matrix R maximizing tr(R M). 
Given SVD M = U Σ V^T, R = U V^T.""" - U, _, Vh = np.linalg.svd(M, full_matrices=False) - return U @ Vh - - -def fro(x): - return float(np.linalg.norm(x)) - - -def normalize_fro(x, eps=1e-12): - n = fro(x) - return x / max(n, eps) - - -@torch.no_grad() -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("--model", default="Qwen/Qwen3-4B") - ap.add_argument("--block-start", type=int, default=10) - ap.add_argument("--block-end", type=int, default=25, - help="inclusive; this is mid-block of 36-layer model") - ap.add_argument("--out", default="/tmp/sa-layer-variation.json") - args = ap.parse_args() - - print(f"Loading {args.model} ...", flush=True) - model = AutoModelForCausalLM.from_pretrained( - args.model, - torch_dtype=torch.float32, - device_map="cpu", - trust_remote_code=True, - attn_implementation="eager", - ) - cfg = model.config - num_layers = cfg.num_hidden_layers - num_heads = cfg.num_attention_heads - num_kv_heads = getattr(cfg, "num_key_value_heads", num_heads) - hidden = cfg.hidden_size - head_dim = getattr(cfg, "head_dim", hidden // num_heads) - intermediate = cfg.intermediate_size - print(f" L={num_layers} H={num_heads} kv={num_kv_heads} hd={head_dim} " - f"hidden={hidden} ff={intermediate}", flush=True) - - # Collect per-layer weight matrices as numpy float32. 
- def get_np(name, idx): - w = getattr(model.model.layers[idx], name, None) - if w is None: - return None - return w - - layers = {} - for L in range(num_layers): - layer = model.model.layers[L] - attn = layer.self_attn - mlp = layer.mlp - layers[L] = { - "q_proj": attn.q_proj.weight.detach().numpy().astype(np.float32), # (nh*hd, hidden) - "k_proj": attn.k_proj.weight.detach().numpy().astype(np.float32), # (nkv*hd, hidden) - "v_proj": attn.v_proj.weight.detach().numpy().astype(np.float32), - "o_proj": attn.o_proj.weight.detach().numpy().astype(np.float32), # (hidden, nh*hd) - "gate_proj": mlp.gate_proj.weight.detach().numpy().astype(np.float32), - "up_proj": mlp.up_proj.weight.detach().numpy().astype(np.float32), - "down_proj": mlp.down_proj.weight.detach().numpy().astype(np.float32), - "input_ln": layer.input_layernorm.weight.detach().numpy().astype(np.float32), - "post_attn_ln": layer.post_attention_layernorm.weight.detach().numpy().astype(np.float32), - } - # Qwen3 has q_norm / k_norm inside self_attn - q_norm = getattr(attn, "q_norm", None) - k_norm = getattr(attn, "k_norm", None) - if q_norm is not None: - layers[L]["q_norm"] = q_norm.weight.detach().numpy().astype(np.float32) - if k_norm is not None: - layers[L]["k_norm"] = k_norm.weight.detach().numpy().astype(np.float32) - - del model # free memory - - block = list(range(args.block_start, args.block_end + 1)) - pairs = [(block[i], block[i + 1]) for i in range(len(block) - 1)] - print(f"\nAnalyzing block layers {args.block_start}..{args.block_end} " - f"({len(pairs)} consecutive pairs)\n") - - # ------------------------------------------------------------------ - # Reshape attention weights per-head for rotation alignment - # ------------------------------------------------------------------ - def per_head_split(W_qkv, n_heads_for_this): - # W is (n*hd, hidden). Reshape to (n, hd, hidden). - return W_qkv.reshape(n_heads_for_this, head_dim, hidden) - - def per_head_split_o(W_o): - # W is (hidden, n*hd). 
Reshape to (n, hidden, hd). - return W_o.reshape(hidden, num_heads, head_dim).transpose(1, 0, 2) - - # Replicate k/v head index to query head index space (GQA) - def kv_to_q_index(h): - return (h * num_kv_heads) // num_heads - - family_residuals = {fam: [] for fam in - ["q_proj", "k_proj", "v_proj", "o_proj", - "gate_proj", "up_proj", "down_proj", - "input_ln", "post_attn_ln", "q_norm", "k_norm"]} - - for (L1, L2) in pairs: - A = layers[L1] - B = layers[L2] - - # Per-head attention alignment: - Q1 = per_head_split(A["q_proj"], num_heads) - Q2 = per_head_split(B["q_proj"], num_heads) - K1 = per_head_split(A["k_proj"], num_kv_heads) - K2 = per_head_split(B["k_proj"], num_kv_heads) - V1 = per_head_split(A["v_proj"], num_kv_heads) - V2 = per_head_split(B["v_proj"], num_kv_heads) - O1 = per_head_split_o(A["o_proj"]) # (num_heads, hidden, hd) - O2 = per_head_split_o(B["o_proj"]) - - q_res = [] - k_res = [] - v_res = [] - o_res = [] - for h in range(num_heads): - kv_h = kv_to_q_index(h) - # Normalize each matrix by its Frobenius norm - qa = normalize_fro(Q1[h]) - qb = normalize_fro(Q2[h]) - ka = normalize_fro(K1[kv_h]) - kb = normalize_fro(K2[kv_h]) - va = normalize_fro(V1[kv_h]) - vb = normalize_fro(V2[kv_h]) - oa = normalize_fro(O1[h]) - ob = normalize_fro(O2[h]) - - # Cross-correlation for Procrustes: find R (hd × hd) maximizing - # tr(R [Qa Qb^T + Ka Kb^T + Va Vb^T + (Oa^T Ob)]) - # Q, K, V are (hd, hidden); Q2 Q1^T would be (hd, hd); etc. - M = qa @ qb.T + ka @ kb.T + va @ vb.T + (oa.T @ ob) # all (hd, hd) - # Wait: for Q we want tr(R qa qb^T). So the matrix in the max-trace - # Procrustes is qb @ qa.T? Let me be careful. - # max_R tr(R M) achieved at R = U V^T with SVD M = U Σ V^T. - # Here we want R such that R qa ≈ qb → minimize ||R qa - qb||² - # = const - 2 tr(R qa qb^T). So max tr(R qa qb^T) gives the - # correct R. Redo M as sum of qa qb^T terms. - M = qa @ qb.T + ka @ kb.T + va @ vb.T - # For O: want W_O^h R^T ≈ W_O^h_target, i.e. 
oa R^T ≈ ob - # → min ||oa R^T - ob||² = const - 2 tr(R oa^T ob); max that. - # So O contributes oa^T @ ob to the cross-correlation matrix. - M = M + oa.T @ ob - R = procrustes(M) - - # Apply R and measure residual (Frobenius distance) per-matrix - q_res.append(fro(R @ qa - qb)) - k_res.append(fro(R @ ka - kb)) - v_res.append(fro(R @ va - vb)) - o_res.append(fro(oa @ R.T - ob)) - - family_residuals["q_proj"].append(float(np.mean(q_res))) - family_residuals["k_proj"].append(float(np.mean(k_res))) - family_residuals["v_proj"].append(float(np.mean(v_res))) - family_residuals["o_proj"].append(float(np.mean(o_res))) - - # MLP d_ff rotation alignment: find S (d_ff × d_ff) orthogonal with - # S gate_a ≈ gate_b and S up_a ≈ up_b simultaneously; adjust down_proj. - # Each is (d_ff, hidden). - ga = normalize_fro(A["gate_proj"]) - gb = normalize_fro(B["gate_proj"]) - ua = normalize_fro(A["up_proj"]) - ub = normalize_fro(B["up_proj"]) - da = normalize_fro(A["down_proj"]) # (hidden, d_ff) - db = normalize_fro(B["down_proj"]) - # M_ff = ga @ gb^T + ua @ ub^T + da^T @ db (all d_ff × d_ff) - M_ff = ga @ gb.T + ua @ ub.T + da.T @ db - S = procrustes(M_ff) - family_residuals["gate_proj"].append(fro(S @ ga - gb)) - family_residuals["up_proj"].append(fro(S @ ua - ub)) - family_residuals["down_proj"].append(fro(da @ S.T - db)) - - # LayerNorm γ vectors — no rotation gauge; just scale-normalize and diff - for ln_name in ["input_ln", "post_attn_ln", "q_norm", "k_norm"]: - if ln_name in A and ln_name in B: - va_ = normalize_fro(A[ln_name]) - vb_ = normalize_fro(B[ln_name]) - family_residuals[ln_name].append(fro(va_ - vb_)) - - # Report - print("=== Per-family Frobenius residual between consecutive layers, " - f"block L={args.block_start}..{args.block_end}, after alignment + scale-norm ===\n") - print(f" (Residual = Frobenius distance between L and L+1 after rotation alignment;") - print(f" lower = more shared across block; higher = carries layer-to-layer drift)\n") - print(f" 
{'family':>14} {'mean':>8} {'min':>8} {'max':>8} {'std':>8} n") - # Report families sorted by mean variation - items = [(fam, np.array(v)) for fam, v in family_residuals.items() if len(v) > 0] - items.sort(key=lambda kv: float(kv[1].mean())) - for fam, v in items: - print(f" {fam:>14} {v.mean():>8.4f} {v.min():>8.4f} {v.max():>8.4f} {v.std():>8.4f} {len(v)}") - - print(f"\n Families ranked least-to-most variation:") - for i, (fam, v) in enumerate(items): - print(f" {i+1}. {fam} (mean residual {v.mean():.4f})") - - # Save - with open(args.out, "w") as f: - json.dump({ - "model": args.model, - "block_start": args.block_start, - "block_end": args.block_end, - "family_residuals": {k: list(v) for k, v in family_residuals.items()}, - }, f, indent=2) - print(f"\nSaved: {args.out}") - - -if __name__ == "__main__": - main() diff --git a/sa-schedule-measure-grams.py b/sa-schedule-measure-grams.py deleted file mode 100644 index 726baf8..0000000 --- a/sa-schedule-measure-grams.py +++ /dev/null @@ -1,168 +0,0 @@ -"""Measure the full inter-layer geometric relationship between per-head metrics. - -For each (L, L', h) pair, compute the Frobenius inner product - = tr(g_L^h^T g_L'^h) -where g^h = W_K^h^T W_Q^h ∈ R^{hidden × hidden} (rank ≤ head_dim). - -Using the head_dim × head_dim shortcut: - = tr(A B^T) with A = W_K_L W_K_L'^T, B = W_Q_L W_Q_L'^T. - -Output: gram[L, L', h] and fro_sq[L, h]. From these every layer-pair comparison -is derivable without saving the full operators. - -Also saves top-k principal directions per head (as right singular vectors of g, -which are the Q-side eigen-directions) so subspace overlap across layers can be -computed downstream. 
-""" -import argparse -import json -import os -import numpy as np -import torch -from transformers import AutoModelForCausalLM - - -@torch.no_grad() -def measure(model_name: str, out_path: str, topk: int = 8, - dtype=torch.bfloat16): - print(f"Loading {model_name} ...", flush=True) - model = AutoModelForCausalLM.from_pretrained( - model_name, - torch_dtype=dtype, - device_map="cuda", - trust_remote_code=True, - attn_implementation="eager", - ) - model.eval() - cfg = model.config - num_layers = cfg.num_hidden_layers - num_heads = cfg.num_attention_heads - hidden = cfg.hidden_size - head_dim = getattr(cfg, "head_dim", hidden // num_heads) - num_kv_heads = getattr(cfg, "num_key_value_heads", num_heads) - print(f" L={num_layers} H={num_heads} kv={num_kv_heads} hd={head_dim}", flush=True) - - # Collect W_Q, W_K per layer as (num_heads, head_dim, hidden) on GPU float32. - Wq_list = [] - Wk_list = [] - for L, layer in enumerate(model.model.layers): - attn = layer.self_attn - Wq = attn.q_proj.weight.detach().to(torch.float32) # (nh*hd, hidden) - Wk = attn.k_proj.weight.detach().to(torch.float32) # (nkv*hd, hidden) - Wq = Wq.view(num_heads, head_dim, hidden) - # Repeat kv heads so every query head has a matching k-row - Wk = Wk.view(num_kv_heads, head_dim, hidden) - # Broadcast to num_heads via (h // (num_heads // num_kv_heads))? safer: mapping - Wk_full = torch.zeros(num_heads, head_dim, hidden, - device=Wk.device, dtype=Wk.dtype) - for h in range(num_heads): - kv_h = (h * num_kv_heads) // num_heads - Wk_full[h] = Wk[kv_h] - Wq_list.append(Wq) - Wk_list.append(Wk_full) - print(f" loaded weights: {num_layers} layers", flush=True) - - # Per-head top-k right singular vectors of g^h = W_K^T W_Q (hidden, hidden). - # The non-zero right singular vectors of g lie in row-space(W_Q) ⊂ R^hidden. - # For subspace comparison we need vectors in hidden-space. - # - # We also need SIGNED eigenvalues of the symmetric part (g + g^T)/2 to - # determine curvature signs per eigen-direction. 
Since g has rank ≤ d_h, - # (g + g^T) has rank ≤ 2 d_h, and we can compute its signed non-zero - # eigenvalues via the Jordan-Wielandt-style trick: - # eigs(X^T J X) = eigs(J X X^T) for X = [W_Q; W_K], J = [[0, I], [I, 0]]. - # The resulting 2d_h × 2d_h matrix gives us all non-zero eigenvalues of - # (g + g^T) cheaply. - topk_eff = min(topk, head_dim) - eig_dirs = torch.zeros(num_layers, num_heads, topk_eff, hidden, - dtype=torch.float32) - fro_sq = torch.zeros(num_layers, num_heads, dtype=torch.float64) - sym_eigs = torch.zeros(num_layers, num_heads, 2 * head_dim, - dtype=torch.float64) # signed - for L in range(num_layers): - for h in range(num_heads): - Wq = Wq_list[L][h] # (hd, hidden) - Wk = Wk_list[L][h] # (hd, hidden) - small = Wk @ Wq.T # (hd, hd) - U, S, Vh = torch.linalg.svd(small, full_matrices=False) - dirs = Vh @ Wq # (hd, hidden) - dirs = dirs / dirs.norm(dim=-1, keepdim=True).clamp_min(1e-12) - eig_dirs[L, h] = dirs[:topk_eff].cpu() - fro_sq[L, h] = float((S * S).sum()) - - # Signed eigenvalues of (g + g^T) via 2d_h × 2d_h matrix - # J (X X^T) where X = [Wq; Wk] (stacked) - XXT = torch.zeros(2 * head_dim, 2 * head_dim, - device=Wq.device, dtype=Wq.dtype) - XXT[:head_dim, :head_dim] = Wq @ Wq.T - XXT[:head_dim, head_dim:] = Wq @ Wk.T - XXT[head_dim:, :head_dim] = Wk @ Wq.T - XXT[head_dim:, head_dim:] = Wk @ Wk.T - # J matrix is off-diagonal block identity - J = torch.zeros(2 * head_dim, 2 * head_dim, - device=Wq.device, dtype=Wq.dtype) - J[:head_dim, head_dim:] = torch.eye(head_dim, - device=Wq.device, dtype=Wq.dtype) - J[head_dim:, :head_dim] = torch.eye(head_dim, - device=Wq.device, dtype=Wq.dtype) - M = J @ XXT - # M is not symmetric, but its non-zero eigenvalues are those of - # (g + g^T)/2 times 2 → real (since (g + g^T) is symmetric). - # Use general eigvals; imag parts should be near zero up to - # numerical noise. 
- ev = torch.linalg.eigvals(M) - ev_real = ev.real.cpu().double() - # sort by magnitude descending so top eigenvalues come first - order = torch.argsort(ev_real.abs(), descending=True) - sym_eigs[L, h] = ev_real[order] - if L % 8 == 0: - print(f" eigdecomp L={L}", flush=True) - - # Gram matrix: gram[L, L', h] = . - # Using A = W_K_L W_K_L'^T, B = W_Q_L W_Q_L'^T, = tr(A B^T) = sum(A * B). - gram = torch.zeros(num_layers, num_layers, num_heads, dtype=torch.float64) - for L in range(num_layers): - for Lp in range(L, num_layers): - for h in range(num_heads): - Wq_L = Wq_list[L][h] - Wk_L = Wk_list[L][h] - Wq_Lp = Wq_list[Lp][h] - Wk_Lp = Wk_list[Lp][h] - A = Wk_L @ Wk_Lp.T # (hd, hd) - B = Wq_L @ Wq_Lp.T # (hd, hd) - v = float((A * B).sum()) - gram[L, Lp, h] = v - gram[Lp, L, h] = v - if L % 4 == 0: - print(f" gram row L={L}", flush=True) - - # Save - out = { - "model": model_name, - "num_layers": num_layers, - "num_heads": num_heads, - "head_dim": head_dim, - "hidden_size": hidden, - "topk": topk_eff, - "gram": gram.tolist(), - "fro_sq": fro_sq.tolist(), - } - with open(out_path, "w") as f: - json.dump(out, f) - torch.save({"eig_dirs": eig_dirs, "sym_eigs": sym_eigs}, - out_path.replace(".json", "-eigdirs.pt")) - print(f"Wrote {out_path} and {out_path.replace('.json', '-eigdirs.pt')}", - flush=True) - - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("--model", default="Qwen/Qwen3-4B") - ap.add_argument("--out", default="/tmp/sa-grams.json") - ap.add_argument("--topk", type=int, default=8) - args = ap.parse_args() - measure(args.model, args.out, topk=args.topk) - - -if __name__ == "__main__": - main() diff --git a/sa-schedule-null-residual.py b/sa-schedule-null-residual.py deleted file mode 100644 index 616f0fb..0000000 --- a/sa-schedule-null-residual.py +++ /dev/null @@ -1,237 +0,0 @@ -"""Null test: before any fitting, how similar are adjacent layers in the -raw weight-matrix sense? 
- -For each adjacent layer pair (L, L+1) and each parameter family: - 1. Normalize each matrix by its Frobenius norm (unit sphere). - 2. Compute cos-sim = / (||W_L|| ||W_{L+1}||). - 3. Compute residual Δ = W_{L+1,norm} - W_{L,norm}; report ||Δ||_F - (null-if-orthogonal = sqrt(2) ≈ 1.414; null-if-identical = 0). - 4. Report effective rank of Δ (via entropy of normalized spectrum). - -Whole network, not just middle block. Plots cos-sim and residual-rank -trajectories across depth. -""" -import argparse -import json -import numpy as np -import torch -from transformers import AutoModelForCausalLM - - -def spec_entropy(singvals, eps=1e-12): - p = (singvals ** 2) - p = p / max(p.sum(), eps) - p = np.clip(p, eps, 1.0) - return float(-(p * np.log(p)).sum()) - - -def frob(x): - return float(np.linalg.norm(x)) - - -def norm_mat(x, eps=1e-12): - return x / max(frob(x), eps) - - -def null_test_pair(A_dict, B_dict, family_names, num_heads, num_kv_heads, head_dim): - """For each family, compute cos-sim and normalized residual between - adjacent layers. Returns dict of per-family stats.""" - out = {} - for fam in family_names: - if fam not in A_dict or fam not in B_dict: - continue - Wa = A_dict[fam] - Wb = B_dict[fam] - if Wa.shape != Wb.shape: - continue - fa = frob(Wa) - fb = frob(Wb) - if fa < 1e-12 or fb < 1e-12: - continue - cos = float((Wa * Wb).sum() / (fa * fb)) - resid_norm_sq = 2.0 - 2.0 * cos # ||Wa/|| - Wb/|| ||^2 - resid_norm = float(np.sqrt(max(resid_norm_sq, 0.0))) - - # Skip residual SVD — was bottleneck on large matrices; cos-sim - # + scalar fit give us the main signal. Can add back selectively. 
- eff_rank = None - se = None - - out[fam] = { - "cos": cos, - "resid_norm": resid_norm, - "resid_eff_rank": eff_rank, - "resid_spec_entropy": se, - } - return out - - -@torch.no_grad() -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("--model", default="Qwen/Qwen3-4B") - ap.add_argument("--out", default="/tmp/sa-null-residual.json") - args = ap.parse_args() - - print(f"Loading {args.model} ...", flush=True) - model = AutoModelForCausalLM.from_pretrained( - args.model, - torch_dtype=torch.bfloat16, # halve memory vs fp32 - device_map="cpu", - trust_remote_code=True, - attn_implementation="eager", - ) - cfg = model.config - num_layers = cfg.num_hidden_layers - num_heads = cfg.num_attention_heads - num_kv_heads = getattr(cfg, "num_key_value_heads", num_heads) - hidden = cfg.hidden_size - head_dim = getattr(cfg, "head_dim", hidden // num_heads) - intermediate = cfg.intermediate_size - print(f" L={num_layers} H={num_heads} kv={num_kv_heads} hd={head_dim} " - f"hidden={hidden} ff={intermediate}", flush=True) - - families = ["q_proj", "k_proj", "v_proj", "o_proj", - "gate_proj", "up_proj", "down_proj", - "input_ln", "post_attn_ln", "q_norm", "k_norm"] - - layers = {} - for L in range(num_layers): - layer = model.model.layers[L] - attn = layer.self_attn - mlp = layer.mlp - entry = { - "q_proj": attn.q_proj.weight.detach().float().numpy(), - "k_proj": attn.k_proj.weight.detach().float().numpy(), - "v_proj": attn.v_proj.weight.detach().float().numpy(), - "o_proj": attn.o_proj.weight.detach().float().numpy(), - "gate_proj": mlp.gate_proj.weight.detach().float().numpy(), - "up_proj": mlp.up_proj.weight.detach().float().numpy(), - "down_proj": mlp.down_proj.weight.detach().float().numpy(), - "input_ln": layer.input_layernorm.weight.detach().float().numpy(), - "post_attn_ln": layer.post_attention_layernorm.weight.detach().float().numpy(), - } - qn = getattr(attn, "q_norm", None) - kn = getattr(attn, "k_norm", None) - if qn is not None: - entry["q_norm"] = 
qn.weight.detach().float().numpy() - if kn is not None: - entry["k_norm"] = kn.weight.detach().float().numpy() - layers[L] = entry - - del model - - # Also record per-layer scale (Frobenius norm) for the scale-track PCA - scales = {fam: [] for fam in families} - for L in range(num_layers): - for fam in families: - if fam in layers[L]: - scales[fam].append(frob(layers[L][fam])) - else: - scales[fam].append(None) - - # Pairwise null test - pair_results = [] - for L in range(num_layers - 1): - r = null_test_pair(layers[L], layers[L + 1], families, - num_heads, num_kv_heads, head_dim) - pair_results.append({"L": L, "L_next": L + 1, "families": r}) - - # Report - print("\n=== Adjacent-layer raw cos-sim per family ===") - print(" null interpretation: 1.0 = identical matrices up to scale, 0 = orthogonal") - print(f"\n {'L':>3}", end="") - for fam in families: - if any(fam in pr["families"] for pr in pair_results): - print(f" {fam:>12}", end="") - print() - for pr in pair_results: - print(f" {pr['L']:>3}", end="") - for fam in families: - if fam in pr["families"]: - print(f" {pr['families'][fam]['cos']:>+12.4f}", end="") - else: - print(f" {'':>12}", end="") - print() - - # Summary per family + scalar-T fit comparison - # raw_resid = sqrt(2 - 2*cos); scalar_fit = sqrt(1 - cos²) = sin(angle). - # random_baseline = sqrt(2) ≈ 1.414. 
- print("\n=== Per-family summary (across all adjacent pairs) ===") - print(" random baseline = sqrt(2) ≈ 1.414 (what we'd see with no relationship)") - print(f"\n {'family':>14} {'mean_cos':>10} {'median_cos':>11} " - f"{'raw_resid':>10} {'scalar_fit':>11} {'improve_frac':>13} {'mean_SE':>8}") - for fam in families: - cs = [pr["families"].get(fam, {}).get("cos") for pr in pair_results] - cs = [x for x in cs if x is not None] - rs = [pr["families"].get(fam, {}).get("resid_norm") for pr in pair_results] - rs = [x for x in rs if x is not None] - ers = [pr["families"].get(fam, {}).get("resid_eff_rank") for pr in pair_results] - ers = [x for x in ers if x is not None] - ses = [pr["families"].get(fam, {}).get("resid_spec_entropy") for pr in pair_results] - ses = [x for x in ses if x is not None] - if not cs: - continue - raw = np.sqrt(np.maximum(2.0 - 2.0 * np.array(cs), 0.0)).mean() - scalar_fit = np.sqrt(np.maximum(1.0 - np.array(cs) ** 2, 0.0)).mean() - # Improvement fraction: (raw - scalar_fit) / (raw - 0) normalized - # to [0, 1] where 0 = scalar does nothing, 1 = scalar reconstructs. 
- improve_frac = (raw - scalar_fit) / max(raw, 1e-12) - print(f" {fam:>14} {np.mean(cs):>+10.4f} {np.median(cs):>+11.4f} " - f"{raw:>10.4f} {scalar_fit:>11.4f} {improve_frac:>13.4f} " - f"{np.mean(ses) if ses else 0:>8.4f}") - - # Scale-track: Frobenius norm of each family across layers - print("\n=== Scale track: ||W_family||_F across layers ===") - print(f" {'L':>3}", end="") - for fam in families: - if any(s is not None for s in scales[fam]): - print(f" {fam:>12}", end="") - print() - for L in range(num_layers): - print(f" {L:>3}", end="") - for fam in families: - if scales[fam][L] is not None: - print(f" {scales[fam][L]:>12.4f}", end="") - else: - print(f" {'':>12}", end="") - print() - - # PCA of log-scale-track to see dimensionality of schedule - print("\n=== PCA of log-scale-track (dimensionality of schedule) ===") - scale_matrix = [] - fam_used = [] - for fam in families: - vals = scales[fam] - if all(v is not None for v in vals): - scale_matrix.append(np.log(np.array(vals))) - fam_used.append(fam) - scale_matrix = np.array(scale_matrix) # (num_families, L) - # Center per-family - sm_c = scale_matrix - scale_matrix.mean(axis=1, keepdims=True) - # SVD: columns are layers, rows are families - U, S, Vh = np.linalg.svd(sm_c, full_matrices=False) - total = (S ** 2).sum() - print(f" explained variance by mode:") - for i, s in enumerate(S): - pct = float(s ** 2 / max(total, 1e-20)) * 100 - print(f" mode {i+1:>2}: {pct:>6.2f}% " - f"(loadings per family: " - f"{', '.join(f'{fam_used[j]}={U[j, i]:+.2f}' for j in range(len(fam_used)))})") - - # Save - with open(args.out, "w") as f: - json.dump({ - "model": args.model, - "pair_results": pair_results, - "scales": scales, - "scale_pca_singvals": S.tolist(), - "scale_pca_loadings": U.tolist(), - "scale_pca_scores": (np.diag(S) @ Vh).tolist(), - "fam_used": fam_used, - }, f, indent=2) - print(f"\nSaved: {args.out}") - - -if __name__ == "__main__": - main() diff --git a/sa-schedule-readout-measure.py 
b/sa-schedule-readout-measure.py deleted file mode 100644 index 5306fb6..0000000 --- a/sa-schedule-readout-measure.py +++ /dev/null @@ -1,246 +0,0 @@ -""" -SA schedule readout for a dense softmax-attention LLM (Qwen3-8B by default). - -Measures per-layer "temperature" signals: - - entropy of softmax attention (per head, aggregated) - - magnitude of pre-softmax logits (implicit sharpness) - - spectrum of the parameter metric g_L^h = W_K^h^T W_Q^h (static, no forward pass needed) - -Output: - stats.json — numeric summary per layer / head - activations stats by layer accumulated across a calibration set - -Goal: - Compare entropy(L) (dynamic readout) against static spectrum of g_L (parameter-only - prediction). Agreement => schedule is parameter-intrinsic and a scalar per-iteration - T suffices. Disagreement => content-adaptive structure lives in the activations. -""" -import argparse -import json -import os -import math -import torch -import torch.nn.functional as F -from transformers import AutoModelForCausalLM, AutoTokenizer - - -CALIBRATION_PROMPTS = [ - # general knowledge - "The Eiffel Tower is located in", - "Photosynthesis is the process by which", - "The three branches of the US government are", - # math / reasoning - "If a train travels 60 miles per hour for 2.5 hours, the total distance covered is", - "Solve for x: 3x + 7 = 22. 
The answer is x =", - "The derivative of x^3 + 2x^2 is", - # code - "def fibonacci(n):\n if n < 2:\n return n\n return", - "# Python list comprehension to square even numbers in 0-9\nresult = ", - "SELECT name, age FROM users WHERE", - # narrative / long-form - "She opened the old wooden box and found", - "The argument in favor of renewable energy is", - # chat / instruction - "User: What is the capital of Australia?\nAssistant:", - "Write a haiku about autumn:\n", - # factual / lookup - "Albert Einstein was born in the year", - "The speed of light in vacuum is approximately", - # conversational - "I really loved that movie because", - "The main difference between a virus and a bacterium is", - # translation-ish - "The French word for 'apple' is", - # edge cases - "1 + 1 = ", - "Once upon a time, in a land far away,", -] - - -@torch.no_grad() -def measure_model(model_name: str, out_path: str, max_seq_len: int = 256, dtype=torch.bfloat16): - print(f"Loading {model_name} ...", flush=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained( - model_name, - torch_dtype=dtype, - device_map="cuda", - trust_remote_code=True, - attn_implementation="eager", # need raw attention probabilities - ) - model.eval() - - cfg = model.config - num_layers = cfg.num_hidden_layers - num_heads = cfg.num_attention_heads - hidden = cfg.hidden_size - head_dim = getattr(cfg, "head_dim", hidden // num_heads) - num_kv_heads = getattr(cfg, "num_key_value_heads", num_heads) - print(f" num_hidden_layers={num_layers} num_attention_heads={num_heads} " - f"num_kv_heads={num_kv_heads} head_dim={head_dim} hidden_size={hidden}", - flush=True) - - # ---- Static (parameter-only) readout ---- - # Per layer, per head h, compute the metric g^h = W_K^h^T W_Q^h (shape head_dim x head_dim) - # and record its singular spectrum. Metric norm is our "static temperature" prediction. 
- # With grouped-query attention, each query head shares a KV head; we compute metric per - # query head using the shared KV head. - static_stats = [] - for L, layer in enumerate(model.model.layers): - attn = layer.self_attn - W_Q = attn.q_proj.weight.detach().float().cpu() # (num_heads*head_dim, hidden) - W_K = attn.k_proj.weight.detach().float().cpu() # (num_kv_heads*head_dim, hidden) - - per_head_metric_fro = [] - per_head_metric_op = [] - per_head_metric_singvals = [] - for h in range(num_heads): - kv_h = (h * num_kv_heads) // num_heads - wq_h = W_Q[h * head_dim:(h + 1) * head_dim] # (head_dim, hidden) - wk_h = W_K[kv_h * head_dim:(kv_h + 1) * head_dim] # (head_dim, hidden) - # metric on hidden space: M = W_K^h^T W_Q^h shape (hidden, hidden). - # But we only need its non-zero spectrum; equivalently SVD of wk_h^T @ wq_h, - # or simpler: singular values of (wk_h @ wq_h.T) which is head_dim x head_dim. - small = wk_h @ wq_h.T # (head_dim, head_dim) - s = torch.linalg.svdvals(small) # (head_dim,) - per_head_metric_fro.append(float(s.pow(2).sum().sqrt())) - per_head_metric_op.append(float(s.max())) - per_head_metric_singvals.append(s.tolist()) - static_stats.append({ - "layer": L, - "metric_fro_per_head": per_head_metric_fro, - "metric_op_per_head": per_head_metric_op, - "metric_singvals_per_head": per_head_metric_singvals, - }) - if L % 8 == 0: - print(f" static layer {L}: mean op-norm over heads = " - f"{sum(per_head_metric_op)/len(per_head_metric_op):.3f}", - flush=True) - - # ---- Dynamic (activation) readout ---- - # Hook each attention layer with output_attentions. Per layer, per head, accumulate - # sum of attention entropy and sum of pre-softmax logit magnitude across the calibration set. 
- acc_entropy = torch.zeros(num_layers, num_heads, dtype=torch.float64) - acc_logit_mag = torch.zeros(num_layers, num_heads, dtype=torch.float64) - acc_logit_var = torch.zeros(num_layers, num_heads, dtype=torch.float64) - acc_n_positions = torch.zeros(num_layers, dtype=torch.float64) - - # The simplest path: run with output_attentions=True; eager impl returns attn probs. - # We cannot get pre-softmax logits from the HF API directly; extract them manually - # via a forward-pre-hook that snapshots Q and K, compute Q@K^T / sqrt(head_dim), and - # compare against attention_mask (we care about unmasked positions only). - - captured = {} - - def make_hook(layer_idx): - def hook(module, inp, out): - # eager attention returns (attn_output, attn_weights, past_key_value) - # attn_weights has shape (bsz, num_heads, q_len, k_len) - if isinstance(out, tuple) and len(out) >= 2 and out[1] is not None: - captured[layer_idx] = out[1].detach() - else: - captured[layer_idx] = None - return hook - - hooks = [] - for L, layer in enumerate(model.model.layers): - h = layer.self_attn.register_forward_hook(make_hook(L)) - hooks.append(h) - - for i, prompt in enumerate(CALIBRATION_PROMPTS): - inp = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_seq_len).to("cuda") - captured.clear() - _ = model(**inp, output_attentions=True, use_cache=False) - seq_len = inp["input_ids"].shape[1] - - for L in range(num_layers): - aw = captured.get(L, None) - if aw is None: - continue - # aw: (1, num_heads, q_len, k_len), softmax over last dim with causal mask - # entropy: -sum p log p over last dim. Positions with fewer valid keys have - # naturally lower max entropy; we average over positions anyway. - p = aw.float().squeeze(0) # (num_heads, q_len, k_len) - eps = 1e-12 - ent = -(p * (p + eps).log()).sum(dim=-1) # (num_heads, q_len) - acc_entropy[L] += ent.mean(dim=-1).cpu().double() - - # Back out the logits. 
For causal softmax, logit_ij = log p_ij + c(i) for some - # row constant c(i); we can recover up to row constant by log p (masking zeros). - # To get a usable logit magnitude, we take the (unmasked) per-row std. - logp = (p + eps).log() # (num_heads, q_len, k_len) - # mask invalid keys (p==0 means masked) - valid = (p > 0).float() - denom = valid.sum(dim=-1).clamp_min(1) - mean_logp = (logp * valid).sum(dim=-1) / denom - centered = (logp - mean_logp.unsqueeze(-1)) * valid - var_logp = (centered.pow(2).sum(dim=-1) / denom) - # per-row std of logits is a direct readout of logit magnitude (== sharpness) - row_std = var_logp.clamp_min(0).sqrt() # (num_heads, q_len) - acc_logit_mag[L] += row_std.mean(dim=-1).cpu().double() - acc_logit_var[L] += var_logp.mean(dim=-1).cpu().double() - - acc_n_positions += 1 # once per prompt - - if i % 5 == 0: - print(f" prompt {i+1}/{len(CALIBRATION_PROMPTS)} len={seq_len}", flush=True) - - for h in hooks: - h.remove() - - # Normalize by number of prompts (all contributed 1 sample per layer/head) - n = max(len(CALIBRATION_PROMPTS), 1) - mean_entropy = (acc_entropy / n).tolist() - mean_logit_mag = (acc_logit_mag / n).tolist() - mean_logit_var = (acc_logit_var / n).tolist() - - # Assemble output - dynamic_stats = [] - for L in range(num_layers): - dynamic_stats.append({ - "layer": L, - "mean_attention_entropy_per_head": mean_entropy[L], - "mean_logit_std_per_head": mean_logit_mag[L], - "mean_logit_var_per_head": mean_logit_var[L], - "mean_attention_entropy": sum(mean_entropy[L]) / num_heads, - "mean_logit_std": sum(mean_logit_mag[L]) / num_heads, - }) - - output = { - "model": model_name, - "num_layers": num_layers, - "num_heads": num_heads, - "num_kv_heads": num_kv_heads, - "head_dim": head_dim, - "hidden_size": hidden, - "n_prompts": len(CALIBRATION_PROMPTS), - "static": static_stats, - "dynamic": dynamic_stats, - } - - with open(out_path, "w") as f: - json.dump(output, f, indent=2) - print(f"\nWrote {out_path}", flush=True) - - # Quick 
summary to console - print("\nPer-layer schedule readout (averaged over heads):") - print(f" {'L':>3} {'mean_entropy':>14} {'mean_logit_std':>16} {'mean_metric_op':>16}") - for L in range(num_layers): - mean_op = sum(static_stats[L]["metric_op_per_head"]) / num_heads - print(f" {L:>3} " - f"{dynamic_stats[L]['mean_attention_entropy']:>14.4f} " - f"{dynamic_stats[L]['mean_logit_std']:>16.4f} " - f"{mean_op:>16.4f}") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", default="Qwen/Qwen3-8B") - parser.add_argument("--out", default="/tmp/sa-schedule-readout.json") - parser.add_argument("--max-seq-len", type=int, default=256) - args = parser.parse_args() - measure_model(args.model, args.out, max_seq_len=args.max_seq_len) - - -if __name__ == "__main__": - main() diff --git a/sa-schedule-topblock-swap.py b/sa-schedule-topblock-swap.py deleted file mode 100644 index ec582d8..0000000 --- a/sa-schedule-topblock-swap.py +++ /dev/null @@ -1,498 +0,0 @@ -"""Top-block replacement experiment: test SA-schedule hypothesis by -replacing the last 8 layers of Qwen3-4B with variants that progressively -strip out the learned schedule / specialization. - -Variants: - baseline — unmodified reference (PPL sanity check) - schedule_fit — replace input_ln.γ magnitude in top block with - fitted Kirkpatrick γ(L) = 3.53·exp(0.119·L). Directions - preserved, projection weights untouched. - single_op — use layer 35's projection weights for ALL top-block - layers (strip specialization), combined with the fitted - schedule γ(L). Tests if per-layer specialization in top - block is load-bearing or replaceable by schedule. - uniform_gamma — set all top-block input_ln.γ magnitudes to the middle - layer's value (no schedule at all in top block). Tests - necessity of schedule itself. - -Eval: perplexity on a concatenation of calibration prompts + a short -excerpt. Also generation quality on a handful of diagnostic prompts. 
-""" -import argparse -import math -import os -import torch -import torch.nn.functional as F -from transformers import AutoModelForCausalLM, AutoTokenizer - - -# From sa-schedule-fit-gamma.py on Qwen3-4B null-residual data: -# input_ln.γ magnitude ≈ 3.53 · exp(0.119 · L), R² = 0.95 -# Defaults for 4B. Override via env SCHEDULE_A / SCHEDULE_B for other models. -# 32B fit: a=1.02, b=0.0873 -SCHEDULE_A = float(os.environ.get("SCHEDULE_A", "3.53")) if "SCHEDULE_A" in os.environ else 3.53 -SCHEDULE_B = float(os.environ.get("SCHEDULE_B", "0.1191")) if "SCHEDULE_B" in os.environ else 0.1191 - -BLOCK_START = int(os.environ.get("BLOCK_START", 28)) -BLOCK_END = int(os.environ.get("BLOCK_END", 35)) -# Optional: comma-separated "s1-e1,s2-e2,..." blocks for multi-block merge -BLOCKS_ENV = os.environ.get("BLOCKS", "") -if BLOCKS_ENV: - BLOCKS = [tuple(int(x) for x in p.split("-")) for p in BLOCKS_ENV.split(",")] -else: - BLOCKS = [(BLOCK_START, BLOCK_END)] - -CALIB = [ - "The Eiffel Tower is located in", - "Photosynthesis is the process by which", - "The three branches of the US government are the legislative, executive, and", - "If a train travels 60 miles per hour for 2.5 hours, the total distance covered is", - "Solve for x: 3x + 7 = 22. 
The answer is x =", - "The derivative of x^3 + 2x^2 is", - "def fibonacci(n):\n if n < 2:\n return n\n return", - "# Python list comprehension to square even numbers in 0-9\nresult = ", - "SELECT name, age FROM users WHERE", - "She opened the old wooden box and found", - "The argument in favor of renewable energy is", - "User: What is the capital of Australia?\nAssistant:", - "Write a haiku about autumn:\n", - "Albert Einstein was born in the year", - "The speed of light in vacuum is approximately", - "I really loved that movie because", - "The main difference between a virus and a bacterium is", - "The French word for 'apple' is", - "1 + 1 = ", - "Once upon a time, in a land far away,", - "The key insight of general relativity is that gravity is not a force but", - "Water boils at 100 degrees Celsius at standard atmospheric pressure. At higher", - "In object-oriented programming, encapsulation refers to", - "The mitochondria is often called the powerhouse of the cell because it", - "Shakespeare's Hamlet begins with the famous line", -] - -GEN_PROMPTS = [ - "The capital of France is", - "2 + 2 =", - "def reverse_string(s):\n return", - "Albert Einstein developed the theory of", -] - - -def load_model(name=None): - if name is None: - name = os.environ.get("MODEL", "Qwen/Qwen3-4B") - print(f"Loading {name}...", flush=True) - tok = AutoTokenizer.from_pretrained(name, trust_remote_code=True) - m = AutoModelForCausalLM.from_pretrained( - name, torch_dtype=torch.bfloat16, device_map="cuda", - trust_remote_code=True, attn_implementation="eager", - ) - m.eval() - return m, tok - - -def _merge_block(model, block_start, block_end): - """Arithmetic-mean merge projections in [block_start, block_end]; set γ per schedule.""" - layers = [model.model.layers[L] for L in range(block_start, block_end + 1)] - param_names = [ - ("self_attn.q_proj.weight", lambda l: l.self_attn.q_proj.weight), - ("self_attn.k_proj.weight", lambda l: l.self_attn.k_proj.weight), - 
("self_attn.v_proj.weight", lambda l: l.self_attn.v_proj.weight), - ("self_attn.o_proj.weight", lambda l: l.self_attn.o_proj.weight), - ("mlp.gate_proj.weight", lambda l: l.mlp.gate_proj.weight), - ("mlp.up_proj.weight", lambda l: l.mlp.up_proj.weight), - ("mlp.down_proj.weight", lambda l: l.mlp.down_proj.weight), - ] - merged = {} - for name, getter in param_names: - stack = torch.stack([getter(l).data.float() for l in layers], dim=0) - merged[name] = stack.mean(dim=0).to(getter(layers[0]).data.dtype) - for l in layers: - l.self_attn.q_proj.weight.data.copy_(merged["self_attn.q_proj.weight"]) - l.self_attn.k_proj.weight.data.copy_(merged["self_attn.k_proj.weight"]) - l.self_attn.v_proj.weight.data.copy_(merged["self_attn.v_proj.weight"]) - l.self_attn.o_proj.weight.data.copy_(merged["self_attn.o_proj.weight"]) - l.mlp.gate_proj.weight.data.copy_(merged["mlp.gate_proj.weight"]) - l.mlp.up_proj.weight.data.copy_(merged["mlp.up_proj.weight"]) - l.mlp.down_proj.weight.data.copy_(merged["mlp.down_proj.weight"]) - for L in range(block_start, block_end + 1): - predicted = SCHEDULE_A * math.exp(SCHEDULE_B * L) - gamma = model.model.layers[L].input_layernorm.weight.data - gamma.mul_(predicted / gamma.norm().item()) - - -def _procrustes(M): - """Orthogonal R = U V^T maximizing tr(R M) where M = U Σ V^T.""" - U, _, Vh = torch.linalg.svd(M.float(), full_matrices=False) - return U @ Vh - - -def _aligned_merge_block(model, block_start, block_end, align_ff=False): - """Procrustes-align per-head d_h basis (and optionally d_ff) of each - layer in [block_start, block_end] to a reference (middle), then - arithmetic-mean. 
Attention rotation is a true gauge; FF rotation is - not (SiLU breaks it) — align_ff defaults off.""" - cfg = model.config - num_heads = cfg.num_attention_heads - num_kv = getattr(cfg, "num_key_value_heads", num_heads) - hidden = cfg.hidden_size - d_h = getattr(cfg, "head_dim", hidden // num_heads) - - ref_L = (block_start + block_end) // 2 - ref = model.model.layers[ref_L] - dev = ref.self_attn.q_proj.weight.device - dtype = ref.self_attn.q_proj.weight.dtype - - # Reference views, fp32 on device - Qr = ref.self_attn.q_proj.weight.data.float().reshape(num_heads, d_h, hidden) - Kr = ref.self_attn.k_proj.weight.data.float().reshape(num_kv, d_h, hidden) - Vr = ref.self_attn.v_proj.weight.data.float().reshape(num_kv, d_h, hidden) - Or = ref.self_attn.o_proj.weight.data.float().reshape(hidden, num_heads, d_h).permute(1, 0, 2).contiguous() - - if align_ff: - d_ff = cfg.intermediate_size - Gr = ref.mlp.gate_proj.weight.data.float() - Ur = ref.mlp.up_proj.weight.data.float() - Dr = ref.mlp.down_proj.weight.data.float() - - rotated = [] - for L in range(block_start, block_end + 1): - layer = model.model.layers[L] - Q = layer.self_attn.q_proj.weight.data.float().reshape(num_heads, d_h, hidden) - K = layer.self_attn.k_proj.weight.data.float().reshape(num_kv, d_h, hidden) - V = layer.self_attn.v_proj.weight.data.float().reshape(num_kv, d_h, hidden) - O = layer.self_attn.o_proj.weight.data.float().reshape(hidden, num_heads, d_h).permute(1, 0, 2).contiguous() - - if L == ref_L: - Q_new, K_new, V_new, O_new = Q.clone(), K.clone(), V.clone(), O.clone() - else: - Q_new = torch.empty_like(Q) - K_new = torch.empty_like(K) - V_new = torch.empty_like(V) - O_new = torch.empty_like(O) - for h in range(num_heads): - kv_h = (h * num_kv) // num_heads - # Cross-correlation: want R s.t. R @ Q ≈ Qr (row-space align). 
- # For per-head (d_h, hidden): M = Qr @ Q.T + Kr @ K.T + Vr @ V.T + Or^T @ O - # (Or, O are (hidden, d_h) per head) - M = (Qr[h] @ Q[h].T - + Kr[kv_h] @ K[kv_h].T - + Vr[kv_h] @ V[kv_h].T - + Or[h].T @ O[h]) - R = _procrustes(M) - Q_new[h] = R @ Q[h] - K_new[kv_h] = R @ K[kv_h] - V_new[kv_h] = R @ V[kv_h] - O_new[h] = O[h] @ R.T - - rotated.append({ - "q": Q_new.reshape(num_heads * d_h, hidden), - "k": K_new.reshape(num_kv * d_h, hidden), - "v": V_new.reshape(num_kv * d_h, hidden), - "o": O_new.permute(1, 0, 2).reshape(hidden, num_heads * d_h), - }) - - # Average rotated attention - q_avg = torch.stack([r["q"] for r in rotated]).mean(0).to(dtype) - k_avg = torch.stack([r["k"] for r in rotated]).mean(0).to(dtype) - v_avg = torch.stack([r["v"] for r in rotated]).mean(0).to(dtype) - o_avg = torch.stack([r["o"] for r in rotated]).mean(0).to(dtype) - - # FF: naive mean (rotation gauge is fake through SiLU) - layers = [model.model.layers[L] for L in range(block_start, block_end + 1)] - gate_avg = torch.stack([l.mlp.gate_proj.weight.data.float() for l in layers]).mean(0).to(dtype) - up_avg = torch.stack([l.mlp.up_proj.weight.data.float() for l in layers]).mean(0).to(dtype) - down_avg = torch.stack([l.mlp.down_proj.weight.data.float() for l in layers]).mean(0).to(dtype) - - # q_norm/k_norm γ: copy from reference (they're basis-dependent; no clean average in rotated frame) - ref_qn = ref.self_attn.q_norm.weight.data.clone() if getattr(ref.self_attn, "q_norm", None) is not None else None - ref_kn = ref.self_attn.k_norm.weight.data.clone() if getattr(ref.self_attn, "k_norm", None) is not None else None - - for l in layers: - l.self_attn.q_proj.weight.data.copy_(q_avg) - l.self_attn.k_proj.weight.data.copy_(k_avg) - l.self_attn.v_proj.weight.data.copy_(v_avg) - l.self_attn.o_proj.weight.data.copy_(o_avg) - l.mlp.gate_proj.weight.data.copy_(gate_avg) - l.mlp.up_proj.weight.data.copy_(up_avg) - l.mlp.down_proj.weight.data.copy_(down_avg) - if ref_qn is not None and 
getattr(l.self_attn, "q_norm", None) is not None: - l.self_attn.q_norm.weight.data.copy_(ref_qn) - if ref_kn is not None and getattr(l.self_attn, "k_norm", None) is not None: - l.self_attn.k_norm.weight.data.copy_(ref_kn) - - for L in range(block_start, block_end + 1): - predicted = SCHEDULE_A * math.exp(SCHEDULE_B * L) - gamma = model.model.layers[L].input_layernorm.weight.data - gamma.mul_(predicted / gamma.norm().item()) - - -def apply_variant(model, variant): - """Modify model in place according to variant.""" - if variant == "baseline": - return - - if variant == "schedule_fit": - for L in range(BLOCK_START, BLOCK_END + 1): - predicted = SCHEDULE_A * math.exp(SCHEDULE_B * L) - layer = model.model.layers[L] - gamma = layer.input_layernorm.weight.data - cur_norm = gamma.norm().item() - # Preserve direction, scale to predicted magnitude - gamma.mul_(predicted / cur_norm) - - elif variant == "single_op": - # Use middle-of-block as reference, not end (more representative) - ref_L = (BLOCK_START + BLOCK_END) // 2 - ref = model.model.layers[ref_L] - for L in range(BLOCK_START, BLOCK_END + 1): - if L == ref_L: - continue - tgt = model.model.layers[L] - tgt.self_attn.q_proj.weight.data.copy_(ref.self_attn.q_proj.weight.data) - tgt.self_attn.k_proj.weight.data.copy_(ref.self_attn.k_proj.weight.data) - tgt.self_attn.v_proj.weight.data.copy_(ref.self_attn.v_proj.weight.data) - tgt.self_attn.o_proj.weight.data.copy_(ref.self_attn.o_proj.weight.data) - tgt.mlp.gate_proj.weight.data.copy_(ref.mlp.gate_proj.weight.data) - tgt.mlp.up_proj.weight.data.copy_(ref.mlp.up_proj.weight.data) - tgt.mlp.down_proj.weight.data.copy_(ref.mlp.down_proj.weight.data) - # q_norm, k_norm: copy too - if hasattr(tgt.self_attn, "q_norm") and tgt.self_attn.q_norm is not None: - tgt.self_attn.q_norm.weight.data.copy_(ref.self_attn.q_norm.weight.data) - if hasattr(tgt.self_attn, "k_norm") and tgt.self_attn.k_norm is not None: - tgt.self_attn.k_norm.weight.data.copy_(ref.self_attn.k_norm.weight.data) 
- # Keep each layer's OWN input_ln.γ direction but set magnitude to schedule - predicted = SCHEDULE_A * math.exp(SCHEDULE_B * L) - gamma = tgt.input_layernorm.weight.data - gamma.mul_(predicted / gamma.norm().item()) - # post_attn_ln γ: leave as-is for now (could also fit & set) - - elif variant == "ties_op": - # TIES-Merging (Yadav et al. 2023): trim, elect-sign, disjoint merge. - # Operates per parameter family across the N block layers. - density = float(os.environ.get("TIES_DENSITY", "0.2")) - layers = [model.model.layers[L] for L in range(BLOCK_START, BLOCK_END + 1)] - param_names = [ - ("self_attn.q_proj.weight", lambda l: l.self_attn.q_proj.weight), - ("self_attn.k_proj.weight", lambda l: l.self_attn.k_proj.weight), - ("self_attn.v_proj.weight", lambda l: l.self_attn.v_proj.weight), - ("self_attn.o_proj.weight", lambda l: l.self_attn.o_proj.weight), - ("mlp.gate_proj.weight", lambda l: l.mlp.gate_proj.weight), - ("mlp.up_proj.weight", lambda l: l.mlp.up_proj.weight), - ("mlp.down_proj.weight", lambda l: l.mlp.down_proj.weight), - ] - - def ties_merge(tensors, density): - # tensors: list of (out, in) float tensors, same shape - stack = torch.stack([t.float() for t in tensors], dim=0) # (N, out, in) - # --- Step 1: Trim to top-density fraction per tensor --- - n = stack.shape[0] - flat = stack.view(n, -1) - k = int(flat.shape[1] * density) - abs_flat = flat.abs() - # Find magnitude threshold per tensor at top-k - topk_vals, _ = abs_flat.topk(k=k, dim=1) - threshold = topk_vals[:, -1:].expand_as(abs_flat) - mask = abs_flat >= threshold - trimmed = (flat * mask.float()).view_as(stack) - # --- Step 2: Elect sign (majority by total magnitude) --- - mag_per_sign = trimmed.sum(dim=0) # (out, in), signed sum - elected = torch.sign(mag_per_sign) # +1/-1/0 - # --- Step 3: Disjoint merge (average params agreeing with elected sign) --- - agree = (torch.sign(trimmed) == elected.unsqueeze(0)).float() - contributing_count = agree.sum(dim=0).clamp_min(1) - merged_sum = 
(trimmed * agree).sum(dim=0) - merged = merged_sum / contributing_count - return merged - - merged = {} - for name, getter in param_names: - tensors = [getter(l).data for l in layers] - merged[name] = ties_merge(tensors, density).to(getter(layers[0]).data.dtype) - - for l in layers: - l.self_attn.q_proj.weight.data.copy_(merged["self_attn.q_proj.weight"]) - l.self_attn.k_proj.weight.data.copy_(merged["self_attn.k_proj.weight"]) - l.self_attn.v_proj.weight.data.copy_(merged["self_attn.v_proj.weight"]) - l.self_attn.o_proj.weight.data.copy_(merged["self_attn.o_proj.weight"]) - l.mlp.gate_proj.weight.data.copy_(merged["mlp.gate_proj.weight"]) - l.mlp.up_proj.weight.data.copy_(merged["mlp.up_proj.weight"]) - l.mlp.down_proj.weight.data.copy_(merged["mlp.down_proj.weight"]) - - for L in range(BLOCK_START, BLOCK_END + 1): - predicted = SCHEDULE_A * math.exp(SCHEDULE_B * L) - gamma = model.model.layers[L].input_layernorm.weight.data - gamma.mul_(predicted / gamma.norm().item()) - - elif variant == "merged_op": - # Arithmetic mean, for each block in BLOCKS (can be multiple) - for (bs, be) in BLOCKS: - _merge_block(model, bs, be) - return - - elif variant == "aligned_merged_op": - # Procrustes-align per-head d_h basis to block-middle, then mean. - # FF averaged naively (SiLU breaks rotation gauge for FF). - for (bs, be) in BLOCKS: - _aligned_merge_block(model, bs, be, align_ff=False) - return - - elif variant == "flat_merged_op": - # Mean projections AND flatten γ across block. Everything in block - # becomes N copies of the same operator. If block is truly high-T - # diffusion, PPL should match merged_op (schedule is gauge, not - # load-bearing). If schedule helps, flattening γ will hurt. 
- for (bs, be) in BLOCKS: - layers = [model.model.layers[L] for L in range(bs, be + 1)] - param_names = [ - ("self_attn.q_proj.weight", lambda l: l.self_attn.q_proj.weight), - ("self_attn.k_proj.weight", lambda l: l.self_attn.k_proj.weight), - ("self_attn.v_proj.weight", lambda l: l.self_attn.v_proj.weight), - ("self_attn.o_proj.weight", lambda l: l.self_attn.o_proj.weight), - ("mlp.gate_proj.weight", lambda l: l.mlp.gate_proj.weight), - ("mlp.up_proj.weight", lambda l: l.mlp.up_proj.weight), - ("mlp.down_proj.weight", lambda l: l.mlp.down_proj.weight), - ] - merged = {} - for name, getter in param_names: - stack = torch.stack([getter(l).data.float() for l in layers], dim=0) - merged[name] = stack.mean(dim=0).to(getter(layers[0]).data.dtype) - gamma_mean = torch.stack([l.input_layernorm.weight.data.float() - for l in layers]).mean(0).to(layers[0].input_layernorm.weight.data.dtype) - post_attn_mean = torch.stack([l.post_attention_layernorm.weight.data.float() - for l in layers]).mean(0).to(layers[0].post_attention_layernorm.weight.data.dtype) - for l in layers: - l.self_attn.q_proj.weight.data.copy_(merged["self_attn.q_proj.weight"]) - l.self_attn.k_proj.weight.data.copy_(merged["self_attn.k_proj.weight"]) - l.self_attn.v_proj.weight.data.copy_(merged["self_attn.v_proj.weight"]) - l.self_attn.o_proj.weight.data.copy_(merged["self_attn.o_proj.weight"]) - l.mlp.gate_proj.weight.data.copy_(merged["mlp.gate_proj.weight"]) - l.mlp.up_proj.weight.data.copy_(merged["mlp.up_proj.weight"]) - l.mlp.down_proj.weight.data.copy_(merged["mlp.down_proj.weight"]) - l.input_layernorm.weight.data.copy_(gamma_mean) - l.post_attention_layernorm.weight.data.copy_(post_attn_mean) - return - - elif variant == "reverse_order": - # Reverse the order of layers within each block to test whether - # the block implements a trajectory (order-dependent) or iid - # diffusion (order-free). 
- import torch.nn as nn - layers_list = list(model.model.layers) - for (bs, be) in BLOCKS: - rev = layers_list[bs:be + 1][::-1] - layers_list[bs:be + 1] = rev - model.model.layers = nn.ModuleList(layers_list) - # Re-set layer_idx on each layer so attention/cache uses the - # current position, not the original one. - for i, l in enumerate(model.model.layers): - if hasattr(l, "self_attn") and hasattr(l.self_attn, "layer_idx"): - l.self_attn.layer_idx = i - return - - elif variant == "merged_op_OLD_UNREACHABLE": - layers = [model.model.layers[L] for L in range(BLOCK_START, BLOCK_END + 1)] - n = len(layers) - param_names = [ - ("self_attn.q_proj.weight", lambda l: l.self_attn.q_proj.weight), - ("self_attn.k_proj.weight", lambda l: l.self_attn.k_proj.weight), - ("self_attn.v_proj.weight", lambda l: l.self_attn.v_proj.weight), - ("self_attn.o_proj.weight", lambda l: l.self_attn.o_proj.weight), - ("mlp.gate_proj.weight", lambda l: l.mlp.gate_proj.weight), - ("mlp.up_proj.weight", lambda l: l.mlp.up_proj.weight), - ("mlp.down_proj.weight", lambda l: l.mlp.down_proj.weight), - ] - merged = {} - for name, getter in param_names: - stack = torch.stack([getter(l).data.float() for l in layers], dim=0) - merged[name] = stack.mean(dim=0).to(getter(layers[0]).data.dtype) - - for l in layers: - l.self_attn.q_proj.weight.data.copy_(merged["self_attn.q_proj.weight"]) - l.self_attn.k_proj.weight.data.copy_(merged["self_attn.k_proj.weight"]) - l.self_attn.v_proj.weight.data.copy_(merged["self_attn.v_proj.weight"]) - l.self_attn.o_proj.weight.data.copy_(merged["self_attn.o_proj.weight"]) - l.mlp.gate_proj.weight.data.copy_(merged["mlp.gate_proj.weight"]) - l.mlp.up_proj.weight.data.copy_(merged["mlp.up_proj.weight"]) - l.mlp.down_proj.weight.data.copy_(merged["mlp.down_proj.weight"]) - - # Set γ to scheduled values per layer - for L in range(BLOCK_START, BLOCK_END + 1): - predicted = SCHEDULE_A * math.exp(SCHEDULE_B * L) - gamma = model.model.layers[L].input_layernorm.weight.data - 
gamma.mul_(predicted / gamma.norm().item()) - - elif variant == "uniform_gamma": - mid_L = (BLOCK_START + BLOCK_END) // 2 - mid_gamma = model.model.layers[mid_L].input_layernorm.weight.data.clone() - for L in range(BLOCK_START, BLOCK_END + 1): - model.model.layers[L].input_layernorm.weight.data.copy_(mid_gamma) - - else: - raise ValueError(f"Unknown variant {variant}") - - -@torch.no_grad() -def perplexity(model, tok, texts, max_len=512): - total_nll = 0.0 - total_tok = 0 - for text in texts: - enc = tok(text, return_tensors="pt", truncation=True, max_length=max_len).to("cuda") - if enc.input_ids.shape[1] < 2: - continue - out = model(**enc, labels=enc.input_ids) - n = enc.input_ids.shape[1] - 1 - total_nll += float(out.loss.item()) * n - total_tok += n - return math.exp(total_nll / max(total_tok, 1)) - - -@torch.no_grad() -def generate_sample(model, tok, prompt, max_new=40): - enc = tok(prompt, return_tensors="pt").to("cuda") - out = model.generate(**enc, max_new_tokens=max_new, do_sample=False, - pad_token_id=tok.eos_token_id) - return tok.decode(out[0], skip_special_tokens=True) - - -def run_variant(variant): - model, tok = load_model() - apply_variant(model, variant) - print(f"\n=== variant: {variant} ===", flush=True) - ppl = perplexity(model, tok, CALIB) - print(f" perplexity: {ppl:.3f}", flush=True) - for p in GEN_PROMPTS: - out = generate_sample(model, tok, p) - print(f" [{p!r}] -> {out[:200]!r}", flush=True) - del model - torch.cuda.empty_cache() - return ppl - - -def main(): - ap = argparse.ArgumentParser() - ap.add_argument("--variant", default="all", - choices=["all", "baseline", "schedule_fit", - "single_op", "uniform_gamma", "merged_op", - "aligned_merged_op", "flat_merged_op", - "reverse_order", "ties_op"]) - ap.add_argument("--ties-density", type=float, default=0.2, - help="TIES trim density (fraction of top-magnitude params to keep)") - args = ap.parse_args() - - variants = (["baseline", "schedule_fit", "single_op", "uniform_gamma"] - if 
args.variant == "all" else [args.variant]) - results = {} - for v in variants: - results[v] = run_variant(v) - - if len(results) > 1: - print("\n=== Summary ===") - b = results.get("baseline", None) - for v, ppl in results.items(): - rel = f" (×{ppl/b:.2f} baseline)" if b else "" - print(f" {v:<15} PPL {ppl:>8.3f}{rel}") - - -if __name__ == "__main__": - main() diff --git a/src/agent/mod.rs b/src/agent/mod.rs index 6cd24ed..1db40b1 100644 --- a/src/agent/mod.rs +++ b/src/agent/mod.rs @@ -59,7 +59,7 @@ const ACTIVITY_LINGER: std::time::Duration = std::time::Duration::from_secs(5); impl Drop for ActivityGuard { fn drop(&mut self) { - { let mut st = self.agent.state.lock_blocking(); + if let Ok(mut st) = self.agent.state.try_lock() { if let Some(entry) = st.activities.iter_mut().find(|a| a.id == self.id) { entry.label.push_str(" (complete)"); entry.expires_at = std::time::Instant::now() + ACTIVITY_LINGER; diff --git a/src/agent/tools/mcp_client.rs b/src/agent/tools/mcp_client.rs index 50c4e47..78c06f8 100644 --- a/src/agent/tools/mcp_client.rs +++ b/src/agent/tools/mcp_client.rs @@ -152,7 +152,7 @@ async fn ensure_init(agent: Option<&std::sync::Arc>) -> Res let msg = format!("MCP server {} failed: {:#}", cfg.name, e); dbglog!("{}", msg); if let Some(a) = agent { - { let mut st = a.state.lock_blocking(); + if let Ok(mut st) = a.state.try_lock() { st.notify(msg); } } diff --git a/src/locks.rs b/src/locks.rs index 6004034..dda4cb2 100644 --- a/src/locks.rs +++ b/src/locks.rs @@ -135,23 +135,6 @@ impl TrackedMutex { location, }) } - - /// Block the current thread until the lock is acquired. - /// Safe to call from sync contexts (UI thread, slash commands) where - /// .await isn't available. Uses block_in_place so the tokio runtime - /// can schedule other tasks while we wait. 
- #[track_caller] - pub fn lock_blocking(&self) -> TrackedMutexGuard<'_, T> { - let location = Location::caller(); - let guard = tokio::task::block_in_place(|| { - futures::executor::block_on(self.inner.lock()) - }); - TrackedMutexGuard { - guard, - acquired_at: Instant::now(), - location, - } - } } pub struct TrackedMutexGuard<'a, T> { diff --git a/src/subconscious/compare.rs b/src/subconscious/compare.rs index 8e42851..f2652ce 100644 --- a/src/subconscious/compare.rs +++ b/src/subconscious/compare.rs @@ -104,6 +104,6 @@ async fn run( prior_context: render_prior_context(entries, entry_idx, 2), timestamp_ns: node_timestamp_ns(node), }); - { let st = agent.state.lock_blocking(); st.changed.notify_one(); } + if let Ok(st) = agent.state.try_lock() { st.changed.notify_one(); } } } diff --git a/src/subconscious/learn.rs b/src/subconscious/learn.rs index 8c739b4..129e26b 100644 --- a/src/subconscious/learn.rs +++ b/src/subconscious/learn.rs @@ -736,7 +736,7 @@ async fn run_finetune( gen_alternates, &activity, move |c| { shared.lock().unwrap().finetune_candidates.push(c); - { let st = agent.state.lock_blocking(); st.changed.notify_one(); } + if let Ok(st) = agent.state.try_lock() { st.changed.notify_one(); } }, ).await { Ok((above_threshold, max_div)) => FinetuneScoringStats { diff --git a/src/user/chat.rs b/src/user/chat.rs index 0fb8f45..bd2df25 100644 --- a/src/user/chat.rs +++ b/src/user/chat.rs @@ -34,12 +34,12 @@ fn commands() -> Vec { vec![ handler: |s, _| { let _ = s.mind_tx.send(MindCommand::NewSession); } }, SlashCommand { name: "/save", help: "Save session to disk", handler: |s, _| { - { let mut ag = s.agent.state.lock_blocking(); ag.notify("saved"); } + if let Ok(mut ag) = s.agent.state.try_lock() { ag.notify("saved"); } } }, SlashCommand { name: "/model", help: "Show/switch model (/model )", handler: |s, arg| { if arg.is_empty() { - { let mut ag = s.agent.state.lock_blocking(); + if let Ok(mut ag) = s.agent.state.try_lock() { let names = 
s.agent.app_config.model_names(); let label = if names.is_empty() { format!("model: {}", s.agent.model()) @@ -62,7 +62,7 @@ fn commands() -> Vec { vec![ SlashCommand { name: "/dmn", help: "Show DMN state", handler: |s, _| { let st = s.shared_mind.lock().unwrap(); - { let mut ag = s.agent.state.lock_blocking(); + if let Ok(mut ag) = s.agent.state.try_lock() { ag.notify(format!("DMN: {:?} ({}/{})", st.dmn, st.dmn_turns, st.max_dmn_turns)); } } }, @@ -71,7 +71,7 @@ fn commands() -> Vec { vec![ let mut st = s.shared_mind.lock().unwrap(); st.dmn = crate::mind::subconscious::State::Resting { since: std::time::Instant::now() }; st.dmn_turns = 0; - { let mut ag = s.agent.state.lock_blocking(); ag.notify("DMN sleeping"); } + if let Ok(mut ag) = s.agent.state.try_lock() { ag.notify("DMN sleeping"); } } }, SlashCommand { name: "/wake", help: "Wake DMN to foraging", handler: |s, _| { @@ -79,14 +79,14 @@ fn commands() -> Vec { vec![ if matches!(st.dmn, crate::mind::subconscious::State::Off) { crate::mind::subconscious::set_off(false); } st.dmn = crate::mind::subconscious::State::Foraging; st.dmn_turns = 0; - { let mut ag = s.agent.state.lock_blocking(); ag.notify("DMN foraging"); } + if let Ok(mut ag) = s.agent.state.try_lock() { ag.notify("DMN foraging"); } } }, SlashCommand { name: "/pause", help: "Full stop — no autonomous ticks (Ctrl+P)", handler: |s, _| { let mut st = s.shared_mind.lock().unwrap(); st.dmn = crate::mind::subconscious::State::Paused; st.dmn_turns = 0; - { let mut ag = s.agent.state.lock_blocking(); ag.notify("DMN paused"); } + if let Ok(mut ag) = s.agent.state.try_lock() { ag.notify("DMN paused"); } } }, SlashCommand { name: "/help", help: "Show this help", handler: |s, _| { notify_help(&s.agent); } }, @@ -116,7 +116,7 @@ pub async fn cmd_switch_model( } fn notify_help(agent: &std::sync::Arc) { - { let mut ag = agent.state.lock_blocking(); + if let Ok(mut ag) = agent.state.try_lock() { let mut help = String::new(); for cmd in &commands() { 
help.push_str(&format!("{:12} {}\n", cmd.name, cmd.help)); @@ -581,10 +581,16 @@ impl InteractScreen { self.pending_display_count = 0; let (generation, entries) = { - let st = self.agent.state.lock_blocking(); + let st = match self.agent.state.try_lock() { + Ok(st) => st, + Err(_) => return, + }; let generation = st.generation; drop(st); - let ctx = self.agent.context.lock_blocking(); + let ctx = match self.agent.context.try_lock() { + Ok(ctx) => ctx, + Err(_) => return, + }; (generation, ctx.conversation().to_vec()) }; @@ -648,7 +654,7 @@ impl InteractScreen { if let Some(cmd) = dispatch_command(input) { (cmd.handler)(self, &input[cmd.name.len()..].trim_start()); } else { - { let mut ag = self.agent.state.lock_blocking(); + if let Ok(mut ag) = self.agent.state.try_lock() { ag.notify(format!("unknown: {}", input.split_whitespace().next().unwrap_or(input))); } } @@ -764,8 +770,9 @@ impl InteractScreen { /// Draw the main (F1) screen — four-pane layout with status bar. fn draw_main(&mut self, frame: &mut Frame, size: Rect, app: &App) { // Main layout: content area + active tools overlay + status bar - let st_guard = app.agent.state.lock_blocking(); - let tool_lines = st_guard.active_tools.len() as u16; + let st_guard = app.agent.state.try_lock().ok(); + let tool_lines = st_guard.as_ref() + .map(|st| st.active_tools.len() as u16).unwrap_or(0); let main_chunks = Layout::default() .direction(Direction::Vertical) .constraints([ @@ -854,9 +861,10 @@ impl InteractScreen { frame.render_widget(gutter, input_chunks[0]); frame.render_widget(&self.textarea, input_chunks[1]); - if !st_guard.active_tools.is_empty() { + if let Some(ref st) = st_guard { + if !st.active_tools.is_empty() { let tool_style = Style::default().fg(Color::Yellow).add_modifier(Modifier::DIM); - let tool_text: Vec = st_guard.active_tools.iter().map(|t| { + let tool_text: Vec = st.active_tools.iter().map(|t| { let elapsed = t.started.elapsed().as_secs(); let line = if t.detail.is_empty() { format!(" [{}] 
({}s)", t.name, elapsed) @@ -867,7 +875,7 @@ impl InteractScreen { }).collect(); let tool_para = Paragraph::new(tool_text); frame.render_widget(tool_para, tools_overlay_area); - } + }} // Draw status bar with live activity indicator let timer = if !app.activity.is_empty() { @@ -1018,7 +1026,7 @@ impl ScreenView for InteractScreen { self.sync_from_agent(); // Read status from agent + mind state - { let mut st = self.agent.state.lock_blocking(); + if let Ok(mut st) = self.agent.state.try_lock() { st.expire_activities(); app.status.prompt_tokens = st.last_prompt_tokens; app.status.model = self.agent.model().to_string(); @@ -1028,7 +1036,7 @@ impl ScreenView for InteractScreen { app.activity_started = st.activities.last() .map(|a| a.started); } - { let ctx = self.agent.context.lock_blocking(); + if let Ok(ctx) = self.agent.context.try_lock() { let window = crate::agent::context::context_window(); if window > 0 { let sys = ctx.system().iter().map(|n| n.tokens()).sum::(); diff --git a/src/user/context.rs b/src/user/context.rs index c6765d0..8edd926 100644 --- a/src/user/context.rs +++ b/src/user/context.rs @@ -20,7 +20,10 @@ impl ConsciousScreen { } fn read_context_views(&self) -> Vec { - let ctx = self.agent.context.lock_blocking(); + let ctx = match self.agent.context.try_lock() { + Ok(ctx) => ctx, + Err(_) => return Vec::new(), + }; let mut views: Vec = Vec::new(); @@ -158,7 +161,8 @@ impl ScreenView for ConsciousScreen { ))); lines.push(Line::raw(format!(" Reasoning: {}", app.reasoning_effort))); lines.push(Line::raw(format!(" Running processes: {}", app.running_processes))); - let tool_count = { let st = app.agent.state.lock_blocking(); st.active_tools.len() }; + let tool_count = app.agent.state.try_lock() + .map(|st| st.active_tools.len()).unwrap_or(0); lines.push(Line::raw(format!(" Active tools: {}", tool_count))); let block = pane_block("context") diff --git a/src/user/mod.rs b/src/user/mod.rs index cd617cc..80754a1 100644 --- a/src/user/mod.rs +++ 
b/src/user/mod.rs @@ -292,7 +292,7 @@ async fn start(cli: crate::user::CliArgs) -> Result<()> { } fn hotkey_cycle_reasoning(mind: &crate::mind::Mind) { - { let mut ag = mind.agent.state.lock_blocking(); + if let Ok(mut ag) = mind.agent.state.try_lock() { let next = match ag.reasoning_effort.as_str() { "none" => "low", "low" => "high", @@ -344,7 +344,7 @@ fn hotkey_cycle_autonomy(mind: &crate::mind::Mind) { }; s.dmn_turns = 0; drop(s); - { let mut ag = mind.agent.state.lock_blocking(); + if let Ok(mut ag) = mind.agent.state.try_lock() { ag.notify(format!("DMN → {}", label)); } } @@ -419,7 +419,7 @@ async fn run( terminal.hide_cursor()?; - { let mut ag = agent.state.lock_blocking(); ag.notify("consciousness v0.3"); } + if let Ok(mut ag) = agent.state.try_lock() { ag.notify("consciousness v0.3"); } // Initial render { @@ -526,7 +526,7 @@ async fn run( } app.walked_count = mind.subconscious_walked().await.len(); if !startup_done { - { let mut ag = agent.state.lock_blocking(); + if let Ok(mut ag) = agent.state.try_lock() { let model = agent.model().to_string(); ag.notify(format!("model: {}", model)); startup_done = true; @@ -545,7 +545,7 @@ async fn run( if let Some(rx_mutex) = STDERR_RX.get() { if let Ok(rx) = rx_mutex.try_lock() { while let Ok(line) = rx.try_recv() { - { let mut ag = agent.state.lock_blocking(); + if let Ok(mut ag) = agent.state.try_lock() { ag.notify(format!("stderr: {}", line)); dirty = true; } diff --git a/src/user/subconscious.rs b/src/user/subconscious.rs index 52ecb1e..c71642d 100644 --- a/src/user/subconscious.rs +++ b/src/user/subconscious.rs @@ -222,30 +222,31 @@ impl SubconsciousScreen { let fork_point = app.agent_state.get(self.selected()) .map(|s| s.fork_point).unwrap_or(0); - { - let ctx = agent.context.lock_blocking(); - let mut views = Vec::new(); - views.push(section_to_view("System", ctx.system())); - views.push(section_to_view("Identity", ctx.identity())); - views.push(section_to_view("Journal", ctx.journal())); + 
agent.context.try_lock().ok() + .map(|ctx| { + let mut views = Vec::new(); + views.push(section_to_view("System", ctx.system())); + views.push(section_to_view("Identity", ctx.identity())); + views.push(section_to_view("Journal", ctx.journal())); - // Conversation: skip to fork point for subconscious agents - let conv = ctx.conversation(); - let conv_view = section_to_view("Conversation", conv); - let fork = fork_point.min(conv_view.children.len()); - let conv_children: Vec = conv_view.children - .into_iter().skip(fork).collect(); - views.push(SectionView { - name: format!("Conversation ({} entries)", conv_children.len()), - tokens: conv_children.iter().map(|c| c.tokens).sum(), - content: String::new(), - token_ids: Vec::new(), - children: conv_children, - status: String::new(), - }); + // Conversation: skip to fork point for subconscious agents + let conv = ctx.conversation(); + let conv_view = section_to_view("Conversation", conv); + let fork = fork_point.min(conv_view.children.len()); + let conv_children: Vec = conv_view.children + .into_iter().skip(fork).collect(); + views.push(SectionView { + name: format!("Conversation ({} entries)", conv_children.len()), + tokens: conv_children.iter().map(|c| c.tokens).sum(), + content: String::new(), + token_ids: Vec::new(), + children: conv_children, + status: String::new(), + }); - views - } + views + }) + .unwrap_or_default() } fn draw_list(&mut self, frame: &mut Frame, area: Rect, app: &App) { diff --git a/src/user/thalamus.rs b/src/user/thalamus.rs index 83693ef..ed97035 100644 --- a/src/user/thalamus.rs +++ b/src/user/thalamus.rs @@ -45,7 +45,7 @@ impl ScreenView for ThalamusScreen { } KeyCode::Char('t') => { app.think_native = !app.think_native; - { let mut st = app.agent.state.lock_blocking(); + if let Ok(mut st) = app.agent.state.try_lock() { st.think_native = app.think_native; let status = if app.think_native { "enabled" } else { "disabled" }; st.notify(format!("native thinking {}", status)); @@ -53,7 +53,7 @@ impl 
ScreenView for ThalamusScreen { } KeyCode::Char('T') => { app.think_tool = !app.think_tool; - { let mut st = app.agent.state.lock_blocking(); + if let Ok(mut st) = app.agent.state.try_lock() { st.think_tool = app.think_tool; // Add or remove the think tool from the tools list if app.think_tool { From 91c8451f5cfcf933374c51f090c3cebfb89c6e0b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 1 May 2026 17:58:32 -0400 Subject: [PATCH 14/31] user: fix hotkey_cycle_reasoning after lock_blocking revert The revert at 09896cd dropped the try_lock() wrapper but left an extra closing brace and the async-call site still un-awaited, leaving the tree unbuildable. Re-flow the function body to match the new signature. Co-Authored-By: Proof of Concept --- src/user/mod.rs | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/user/mod.rs b/src/user/mod.rs index 80754a1..a5b7a1b 100644 --- a/src/user/mod.rs +++ b/src/user/mod.rs @@ -291,22 +291,21 @@ async fn start(cli: crate::user::CliArgs) -> Result<()> { ui_handle.join().unwrap_or_else(|_| Err(anyhow::anyhow!("UI thread panicked"))) } -fn hotkey_cycle_reasoning(mind: &crate::mind::Mind) { - if let Ok(mut ag) = mind.agent.state.try_lock() { - let next = match ag.reasoning_effort.as_str() { - "none" => "low", - "low" => "high", - _ => "none", - }; - ag.reasoning_effort = next.to_string(); - let label = match next { - "none" => "off (monologue hidden)", - "low" => "low (brief monologue)", - "high" => "high (full monologue)", - _ => next, - }; - ag.notify(format!("reasoning: {}", label)); - } +async fn hotkey_cycle_reasoning(mind: &crate::mind::Mind) { + let mut ag = mind.agent.state.lock().await; + let next = match ag.reasoning_effort.as_str() { + "none" => "low", + "low" => "high", + _ => "none", + }; + ag.reasoning_effort = next.to_string(); + let label = match next { + "none" => "off (monologue hidden)", + "low" => "low (brief monologue)", + "high" => "high (full 
monologue)", + _ => next, + }; + ag.notify(format!("reasoning: {}", label)); } async fn hotkey_kill_processes(mind: &crate::mind::Mind) { @@ -592,7 +591,7 @@ async fn run( } else if key.modifiers.contains(KeyModifiers::CONTROL) { match key.code { KeyCode::Char('c') => { app.should_quit = true; } - KeyCode::Char('r') => hotkey_cycle_reasoning(mind), + KeyCode::Char('r') => hotkey_cycle_reasoning(mind).await, KeyCode::Char('k') => hotkey_kill_processes(mind).await, KeyCode::Char('p') => hotkey_cycle_autonomy(mind), _ => {} From a075e305578d4137548263aedc1c294a7dad00f8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 1 May 2026 17:58:35 -0400 Subject: [PATCH 15/31] http: add HttpResponse::bytes() for binary downloads Mirror of text(), but returns raw Bytes without lossy UTF-8 conversion. Needed by the Telegram channel to fetch photo files. Co-Authored-By: Proof of Concept --- src/agent/api/http.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/agent/api/http.rs b/src/agent/api/http.rs index 65b759b..a059426 100644 --- a/src/agent/api/http.rs +++ b/src/agent/api/http.rs @@ -154,6 +154,14 @@ impl HttpResponse { Ok(String::from_utf8_lossy(&bytes).into_owned()) } + /// Read the entire body as raw bytes (for binary downloads). + pub async fn bytes(self) -> Result { + let bytes = self.body.collect().await + .context("reading response body")? + .to_bytes(); + Ok(bytes) + } + /// Read the entire body and deserialize as JSON. pub async fn json(self) -> Result { let bytes = self.body.collect().await From c303653dd0ac726fb6a83fd4249305b7d8f714fe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 1 May 2026 17:58:43 -0400 Subject: [PATCH 16/31] telegram: bridge photos via [image: ] markers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an incoming update has a photo array, pick the largest size, resolve the file_id via getFile, and download to ~/.consciousness/channels/telegram.logs/media/.. 
The message line surfaced to the channel is [image: /abs/path/to/file.jpg] so a multimodal Read on the path works end-to-end. On download failure we still surface the caption with an [image: download failed: ...] marker so context isn't lost. Other media types (voice/video/sticker/etc.) log a one-line "skipping" notice — easy hook to extend later. The media/ dir was already being created at startup; this fills in the rest. Co-Authored-By: Proof of Concept --- channels/telegram/src/main.rs | 118 +++++++++++++++++++++++++++++++--- 1 file changed, 108 insertions(+), 10 deletions(-) diff --git a/channels/telegram/src/main.rs b/channels/telegram/src/main.rs index d3753f7..837a044 100644 --- a/channels/telegram/src/main.rs +++ b/channels/telegram/src/main.rs @@ -181,6 +181,8 @@ struct TelegramMessage { chat_id: i64, sender: String, text: String, + /// Absolute path to a downloaded media file (photo, etc.), if any. + media_path: Option, } /// Fetch and parse pending updates from Telegram via long polling. @@ -206,19 +208,107 @@ async fn get_updates( let sender = msg["from"]["first_name"].as_str().unwrap_or("unknown").to_string(); let chat_id = msg["chat"]["id"].as_i64().unwrap_or(0); - if let Some(text) = msg["text"].as_str() { - messages.push(TelegramMessage { - update_id, - chat_id, - sender, - text: text.to_string(), - }); - } + // Photo: array of PhotoSize, largest is last. Download largest, + // surface message with [image: ] marker so the multimodal + // model can Read the image. 
+ let (text, media_path) = if let Some(sizes) = msg["photo"].as_array() { + let caption = msg["caption"].as_str().unwrap_or("").to_string(); + let largest = sizes.last(); + let file_id = largest + .and_then(|s| s["file_id"].as_str()) + .unwrap_or(""); + if file_id.is_empty() { + error!("telegram photo: missing file_id in update {update_id}"); + (caption, None) + } else { + match download_telegram_file(client, token, file_id).await { + Ok(path) => (caption, Some(path)), + Err(e) => { + error!("telegram photo download failed (file_id={file_id}): {e}"); + // Surface what we have: caption plus a marker that + // a photo was sent but couldn't be fetched. + let marker = format!("[image: download failed: {e}]"); + let combined = if caption.is_empty() { + marker + } else { + format!("{marker}\n{caption}") + }; + (combined, None) + } + } + } + } else if let Some(text) = msg["text"].as_str() { + (text.to_string(), None) + } else { + // Other media types (voice, video, sticker, etc.) — skip for now, + // but log so we can extend later. + let kind = ["voice", "video", "sticker", "document", "audio", "animation"] + .iter() + .find(|k| !msg[**k].is_null()) + .copied() + .unwrap_or("unknown"); + info!("telegram: skipping non-text/photo message (kind={kind}, update_id={update_id})"); + continue; + }; + + messages.push(TelegramMessage { + update_id, + chat_id, + sender, + text, + media_path, + }); } } Ok(messages) } +/// Resolve a Telegram file_id to a downloadable URL path via getFile. 
+async fn get_file_path( + client: &HttpClient, + token: &str, + file_id: &str, +) -> Result> { + let url = format!( + "https://api.telegram.org/bot{}/getFile?file_id={}", + token, file_id, + ); + let response = client.get(&url).await?; + let body = response.text().await?; + let resp: serde_json::Value = serde_json::from_str(&body) + .map_err(|e| format!("getFile JSON parse error: {e}"))?; + if !resp["ok"].as_bool().unwrap_or(false) { + return Err(format!("getFile failed: {}", resp["description"].as_str().unwrap_or("?")).into()); + } + let file_path = resp["result"]["file_path"].as_str() + .ok_or("getFile: missing result.file_path")?; + Ok(file_path.to_string()) +} + +/// Download a Telegram file by file_id into the channel media dir. +/// Returns the absolute local path on success. +async fn download_telegram_file( + client: &HttpClient, + token: &str, + file_id: &str, +) -> Result> { + let file_path = get_file_path(client, token, file_id).await?; + let url = format!("https://api.telegram.org/file/bot{}/{}", token, file_path); + let response = client.get(&url).await?; + let status = response.status(); + if !status.is_success() { + return Err(format!("file download failed: {status}").into()); + } + let bytes = response.bytes().await?; + + let ext = file_path.rsplit('.').next().filter(|e| !e.contains('/')).unwrap_or("dat"); + let media_dir = log_dir().join("media"); + std::fs::create_dir_all(&media_dir)?; + let dest = media_dir.join(format!("{file_id}.{ext}")); + std::fs::write(&dest, &bytes)?; + Ok(dest.to_string_lossy().to_string()) +} + /// Send a text message to a Telegram chat. async fn send_message( client: &HttpClient, @@ -369,11 +459,19 @@ async fn poll_once( let sender_lower = msg.sender.to_lowercase(); let channel = format!("telegram.{}", sender_lower); - channel_log::append_disk_log(&log_dir(), &sender_lower, &msg.sender, &msg.text); + // If the message has media, prepend an [image: ] marker + // so the multimodal model can Read the file directly. 
+ let body = match &msg.media_path { + Some(path) if msg.text.is_empty() => format!("[image: {path}]"), + Some(path) => format!("[image: {path}]\n{}", msg.text), + None => msg.text.clone(), + }; + + channel_log::append_disk_log(&log_dir(), &sender_lower, &msg.sender, &body); let mut s = state.borrow_mut(); s.config.chat_ids.insert(sender_lower, msg.chat_id); - let line = format!("[{}] {}", msg.sender, msg.text); + let line = format!("[{}] {}", msg.sender, body); s.push_message(line, 2, &channel); } From 713bb0772974d2e08889080cb78af33fdd8c3771 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 1 May 2026 18:16:21 -0400 Subject: [PATCH 17/31] =?UTF-8?q?bin:=20add=20ch=20=E2=80=94=20minimal=20c?= =?UTF-8?q?hannel=20CLI=20(send/recv)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Speaks the channel.capnp protocol over the per-daemon Unix socket at ~/.consciousness/channels/<daemon>.sock. Useful for ad-hoc sends from shell, tests, and out-of-process tools that don't want to embed a capnp client. ch send <channel> <message> ch recv <channel> [--all-new] [--min-count N] Co-Authored-By: Proof of Concept --- src/bin/ch.rs | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 src/bin/ch.rs diff --git a/src/bin/ch.rs b/src/bin/ch.rs new file mode 100644 index 0000000..025fe3c --- /dev/null +++ b/src/bin/ch.rs @@ -0,0 +1,112 @@ +// `ch` — minimal channel CLI. +// +// ch send <channel> <message> +// ch recv <channel> [--all-new] [--min-count N] +// +// Connects to ~/.consciousness/channels/<daemon>.sock and speaks the +// channel.capnp protocol to the appropriate daemon.
+ +use std::path::PathBuf; +use std::process::ExitCode; + +use capnp_rpc::{rpc_twoparty_capnp, twoparty, RpcSystem}; +use futures::AsyncReadExt; +use tokio_util::compat::TokioAsyncReadCompatExt; + +use consciousness::channel_capnp::channel_server; + +fn channels_dir() -> PathBuf { + dirs::home_dir().unwrap_or_default().join(".consciousness/channels") +} + +fn sock_for(channel: &str) -> PathBuf { + let top = channel.split('.').next().unwrap_or(channel); + channels_dir().join(format!("{top}.sock")) +} + +async fn connect(sock: &std::path::Path) -> Result { + let stream = tokio::net::UnixStream::connect(sock).await + .map_err(|e| format!("connect {}: {e}", sock.display()))?; + let (reader, writer) = stream.compat().split(); + let network = Box::new(twoparty::VatNetwork::new( + futures::io::BufReader::new(reader), + futures::io::BufWriter::new(writer), + rpc_twoparty_capnp::Side::Client, + Default::default(), + )); + let mut rpc = RpcSystem::new(network, None); + let client: channel_server::Client = rpc.bootstrap(rpc_twoparty_capnp::Side::Server); + tokio::task::spawn_local(rpc); + Ok(client) +} + +#[tokio::main(flavor = "current_thread")] +async fn main() -> ExitCode { + let args: Vec = std::env::args().collect(); + if args.len() < 2 { + eprintln!("usage: {} [args...]", args[0]); + return ExitCode::from(2); + } + + let cmd = args[1].clone(); + let local = tokio::task::LocalSet::new(); + let result: Result<(), String> = local.run_until(async move { + match cmd.as_str() { + "send" => { + if args.len() < 4 { + return Err("usage: ch send ".into()); + } + let channel = &args[2]; + let message = args[3..].join(" "); + let sock = sock_for(channel); + let client = connect(&sock).await?; + let mut req = client.send_request(); + req.get().set_channel(channel); + req.get().set_message(&message); + req.send().promise.await.map_err(|e| format!("send: {e}"))?; + println!("sent to {channel}"); + Ok(()) + } + "recv" => { + if args.len() < 3 { + return Err("usage: ch recv [--all-new] 
[--min-count N]".into()); + } + let channel = &args[2]; + let mut all_new = false; + let mut min_count: u32 = 20; + let mut i = 3; + while i < args.len() { + match args[i].as_str() { + "--all-new" => { all_new = true; i += 1; } + "--min-count" => { + min_count = args.get(i+1) + .ok_or("--min-count needs an argument")? + .parse().map_err(|e| format!("--min-count: {e}"))?; + i += 2; + } + other => return Err(format!("unknown arg: {other}")), + } + } + let sock = sock_for(channel); + let client = connect(&sock).await?; + let mut req = client.recv_request(); + req.get().set_channel(channel); + req.get().set_all_new(all_new); + req.get().set_min_count(min_count); + let reply = req.send().promise.await.map_err(|e| format!("recv: {e}"))?; + let text = reply.get().map_err(|e| e.to_string())? + .get_text().map_err(|e| e.to_string())? + .to_str().map_err(|e| e.to_string())?; + print!("{text}"); + if !text.ends_with('\n') { println!(); } + Ok(()) + } + other => Err(format!("unknown command: {other} (use send|recv)")), + } + }).await; + + match result { + Ok(()) => ExitCode::SUCCESS, + Err(e) => { eprintln!("error: {e}"); ExitCode::from(1) } + } +} From 190eb50ed956f86bbc9fedbc35399d75f4127598 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 1 May 2026 18:56:03 -0400 Subject: [PATCH 18/31] telegram: bound photo download to 60s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HttpClient::request_timeout only covers send_request, not body collect, so a stuck download would otherwise stall the entire long-poll loop indefinitely. tokio::time::timeout at the call site keeps the failure contained — a slow/dead download surfaces as the same [image: download failed: ...] marker as any other error. 60s is generous for the 1-5MB photos Kent typically sends; Telegram's bot getFile cap is 20MB, which would still complete on most connections. 
Co-Authored-By: Proof of Concept --- channels/telegram/src/main.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/channels/telegram/src/main.rs b/channels/telegram/src/main.rs index 837a044..ec7aa66 100644 --- a/channels/telegram/src/main.rs +++ b/channels/telegram/src/main.rs @@ -221,7 +221,15 @@ async fn get_updates( error!("telegram photo: missing file_id in update {update_id}"); (caption, None) } else { - match download_telegram_file(client, token, file_id).await { + // Bound the download — HttpClient::request_timeout only covers + // send_request, not body collect, so an indefinitely-slow body + // would otherwise stall every subsequent poll. + let dl = tokio::time::timeout( + std::time::Duration::from_secs(60), + download_telegram_file(client, token, file_id), + ).await + .unwrap_or_else(|_| Err("download timed out after 60s".into())); + match dl { Ok(path) => (caption, Some(path)), Err(e) => { error!("telegram photo download failed (file_id={file_id}): {e}"); From 6e3bacb1824463c30879516b10b5e6fd3017953a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 May 2026 12:26:05 -0400 Subject: [PATCH 19/31] channel-tmux: resolve pane ids by label, don't persist them MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tmux pane ids (%6 etc.) are ephemeral — recycled across pane and tmux-server restarts. The daemon persisted the id in tmux.json5 and kept reusing it, so after a restart a channel would attach to whatever unrelated pane had since inherited that id. (Live: ktest's stored %6 had become a claude pane; the real ktest pane was %10.) Persist only the label — the pane title / window name, which is stable. pipe_pane_reader() is now a connect-retry loop: each attempt, connect_and_stream() resolves the live id with find_pane_by_name(); the loop retries until the pane exists and pipe-pane succeeds, and reconnects the same way if the pipe later drops. 
send() resolves the id at send time; open() just registers the label and lets the reader find it. Co-Authored-By: Proof of Concept --- channels/tmux/src/main.rs | 182 ++++++++++++++++++++++---------------- 1 file changed, 104 insertions(+), 78 deletions(-) diff --git a/channels/tmux/src/main.rs b/channels/tmux/src/main.rs index f49bdc1..aecb3bd 100644 --- a/channels/tmux/src/main.rs +++ b/channels/tmux/src/main.rs @@ -26,10 +26,12 @@ use consciousness::thalamus::channel_log::ChannelLog; #[derive(Clone, serde::Serialize, serde::Deserialize)] struct PaneConfig { - /// Human-readable label, becomes the channel name "tmux.