api: retry transient connection errors, misc fixes

- Make up to 5 attempts (at most 4 retries) with exponential backoff
  (2s, 4s, 8s, 16s) on transient errors: IncompleteMessage, connection
  closed/reset/refused, timeouts. Non-transient errors fail immediately.
- tail command: print to stdout instead of stderr
- state_dir rename: output_dir → state_dir throughout knowledge.rs

Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
ProofOfConcept 2026-03-26 17:48:44 -04:00
parent 5d803441c9
commit 3e410347a2
3 changed files with 53 additions and 31 deletions

View file

@ -33,18 +33,18 @@ pub fn cmd_tail(n: usize, full: bool) -> Result<(), String> {
};
let del = if node.deleted { " [DELETED]" } else { "" };
if full {
eprintln!("--- {} (v{}) {} via {} w={:.3}{} ---",
println!("--- {} (v{}) {} via {} w={:.3}{} ---",
node.key, node.version, ts, node.provenance, node.weight, del);
eprintln!("{}\n", node.content);
println!("{}\n", node.content);
} else {
let preview = crate::util::first_n_chars(&node.content, 100).replace('\n', "\\n");
eprintln!(" {} v{} w={:.2}{}",
println!(" {} v{} w={:.2}{}",
ts, node.version, node.weight, del);
eprintln!(" {} via {}", node.key, node.provenance);
println!(" {} via {}", node.key, node.provenance);
if !preview.is_empty() {
eprintln!(" {}", preview);
println!(" {}", preview);
}
eprintln!();
println!();
}
}

View file

@ -61,29 +61,52 @@ pub async fn call_api_with_tools(
for turn in 0..max_turns {
log(&format!("\n=== TURN {} ({} messages) ===\n", turn, messages.len()));
let (msg, usage) = client.chat_completion_stream_temp(
&messages,
Some(&tool_defs),
&ui_tx,
StreamTarget::Autonomous,
&reasoning,
temperature,
).await.map_err(|e| {
let msg_bytes: usize = messages.iter()
.map(|m| m.content_text().len())
.sum();
let err_str = e.to_string();
let hint = if err_str.contains("IncompleteMessage") || err_str.contains("connection closed") {
format!(" — likely exceeded model context window (~{}KB ≈ {}K tokens)",
msg_bytes / 1024, msg_bytes / 4096)
} else {
String::new()
};
format!("API error on turn {} (~{}KB payload, {} messages): {}{}",
turn, msg_bytes / 1024, messages.len(), e, hint)
})?;
let mut last_err = None;
let mut msg_opt = None;
let mut usage_opt = None;
for attempt in 0..5 {
match client.chat_completion_stream_temp(
&messages,
Some(&tool_defs),
&ui_tx,
StreamTarget::Autonomous,
&reasoning,
temperature,
).await {
Ok((msg, usage)) => {
msg_opt = Some(msg);
usage_opt = usage;
break;
}
Err(e) => {
let err_str = e.to_string();
let is_transient = err_str.contains("IncompleteMessage")
|| err_str.contains("connection closed")
|| err_str.contains("connection reset")
|| err_str.contains("timed out")
|| err_str.contains("Connection refused");
if is_transient && attempt < 4 {
log(&format!("transient error (attempt {}): {}, retrying...",
attempt + 1, err_str));
tokio::time::sleep(std::time::Duration::from_secs(2 << attempt)).await;
last_err = Some(e);
continue;
}
let msg_bytes: usize = messages.iter()
.map(|m| m.content_text().len())
.sum();
return Err(format!(
"API error on turn {} (~{}KB payload, {} messages, {} attempts): {}",
turn, msg_bytes / 1024, messages.len(), attempt + 1, e));
}
}
}
let msg = msg_opt.unwrap();
if let Some(ref e) = last_err {
log(&format!("succeeded after retry (previous error: {})", e));
}
if let Some(u) = &usage {
if let Some(u) = &usage_opt {
log(&format!("tokens: {} prompt + {} completion",
u.prompt_tokens, u.completion_tokens));
}

View file

@ -264,9 +264,8 @@ pub fn spawn_agent(
let log_dir = store::memory_dir().join("logs");
fs::create_dir_all(&log_dir).ok();
let agent_log = fs::OpenOptions::new()
.create(true).append(true)
.open(log_dir.join(format!("{}.log", agent_name)))
let agent_log = fs::File::create(
log_dir.join(format!("{}-{}.log", agent_name, store::compact_timestamp())))
.unwrap_or_else(|_| fs::File::create("/dev/null").unwrap());
let child = std::process::Command::new("poc-memory")