From 93f98a0a5d149d362487d1b9379bb06ae9afa0d0 Mon Sep 17 00:00:00 2001
From: ProofOfConcept <poc@bcachefs.org>
Date: Tue, 10 Mar 2026 23:29:01 -0400
Subject: [PATCH] llm: add 5-minute timeout to claude subprocess
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The daemon was getting stuck when a claude subprocess hung — no
completion logged, job blocked forever, pending queue growing.

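Replace the blocking output() call with spawn() plus a watchdog
thread. The watchdog sleeps in 1s increments while checking a cancel
flag, sends SIGTERM once 5 minutes have elapsed, and follows up with
SIGKILL after a 5s grace period. The cancel flag lets the watchdog
exit within about a second once the child finishes normally. Any call
that runs longer than 299s is logged and returned as a timeout error.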
---
 poc-memory/src/agents/llm.rs | 51 +++++++++++++++++++++++++++++++++---
 1 file changed, 48 insertions(+), 3 deletions(-)

diff --git a/poc-memory/src/agents/llm.rs b/poc-memory/src/agents/llm.rs
index ab4ea55..81cdfc6 100644
--- a/poc-memory/src/agents/llm.rs
+++ b/poc-memory/src/agents/llm.rs
@@ -38,10 +38,14 @@ fn log_usage(agent: &str, model: &str, prompt: &str, response: &str,
     }
 }
 
+/// Maximum time to wait for a claude subprocess before killing it.
+const SUBPROCESS_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(300); // 5 minutes
+
 /// Call a model via claude CLI. Returns the response text.
 ///
 /// Sets PR_SET_PDEATHSIG on the child so it gets SIGTERM if the
 /// parent daemon exits — no more orphaned claude processes.
+/// Times out after 5 minutes to prevent blocking the daemon forever.
 fn call_model(agent: &str, model: &str, prompt: &str) -> Result<String, String> {
     // Write prompt to temp file (claude CLI needs file input for large prompts)
     let tmp = std::env::temp_dir().join(format!("poc-llm-{}-{:?}.txt",
@@ -53,6 +57,8 @@ fn call_model(agent: &str, model: &str, prompt: &str) -> Result<String, String>
     cmd.args(["-p", "--model", model, "--tools", "", "--no-session-persistence",
               "--strict-mcp-config"])
         .stdin(fs::File::open(&tmp).map_err(|e| format!("open temp: {}", e))?)
+        .stdout(std::process::Stdio::piped())
+        .stderr(std::process::Stdio::piped())
         .env_remove("CLAUDECODE");
 
     // Use separate OAuth credentials for agent work if configured
@@ -65,19 +71,58 @@ fn call_model(agent: &str, model: &str, prompt: &str) -> Result<String, String>
 
     let start = std::time::Instant::now();
 
-    let result = unsafe {
+    let mut child = unsafe {
         cmd.pre_exec(|| {
                 libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM);
                 Ok(())
             })
-            .output()
+            .spawn()
+            .map_err(|e| format!("spawn claude: {}", e))?
     };
 
+    // Spawn a watchdog thread that kills the child after the timeout.
+    // Uses a cancellation flag so the thread exits promptly when the child finishes.
+    let child_id = child.id();
+    let cancel = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false));
+    let cancel_flag = cancel.clone();
+    let watchdog = std::thread::spawn(move || {
+        // Sleep in 1s increments so we can check the cancel flag
+        let deadline = std::time::Instant::now() + SUBPROCESS_TIMEOUT;
+        while std::time::Instant::now() < deadline {
+            if cancel_flag.load(std::sync::atomic::Ordering::Relaxed) {
+                return;
+            }
+            std::thread::sleep(std::time::Duration::from_secs(1));
+        }
+        if cancel_flag.load(std::sync::atomic::Ordering::Relaxed) {
+            return;
+        }
+        // Send SIGTERM, then SIGKILL after 5s grace period
+        unsafe { libc::kill(child_id as i32, libc::SIGTERM); }
+        for _ in 0..5 {
+            std::thread::sleep(std::time::Duration::from_secs(1));
+            if cancel_flag.load(std::sync::atomic::Ordering::Relaxed) {
+                return;
+            }
+        }
+        unsafe { libc::kill(child_id as i32, libc::SIGKILL); }
+    });
+
+    let result = child.wait_with_output();
+
+    // Cancel the watchdog thread
+    cancel.store(true, std::sync::atomic::Ordering::Relaxed);
+    watchdog.join().ok();
+
     fs::remove_file(&tmp).ok();
 
     match result {
         Ok(output) => {
             let elapsed = start.elapsed().as_millis();
+            if elapsed > SUBPROCESS_TIMEOUT.as_millis() - 1000 {
+                log_usage(agent, model, prompt, "TIMEOUT", elapsed, false);
+                return Err(format!("claude timed out after {:.0}s", elapsed as f64 / 1000.0));
+            }
             if output.status.success() {
                 let response = String::from_utf8_lossy(&output.stdout).trim().to_string();
                 log_usage(agent, model, prompt, &response, elapsed, true);
@@ -89,7 +134,7 @@ fn call_model(agent: &str, model: &str, prompt: &str) -> Result<String, String>
                 Err(format!("claude exited {}: {}", output.status, preview.trim()))
             }
         }
-        Err(e) => Err(format!("spawn claude: {}", e)),
+        Err(e) => Err(format!("wait claude: {}", e)),
     }
 }