Add direct API backend for agent execution

When api_base_url is configured, agents call the LLM directly via an
OpenAI-compatible API (vllm, llama.cpp, etc.) instead of shelling
out to the claude CLI. Implements the full tool loop: send the prompt;
if the response contains tool_calls, execute them and send the results
back; repeat until a text response arrives.

This enables running agents against local/remote models like
Qwen-27B on a RunPod B200, with no dependency on the claude CLI.

Config fields: api_base_url, api_key, api_model.
Falls back to claude CLI when api_base_url is not set.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Kent Overstreet 2026-03-18 23:05:14 -04:00
parent 1b48e57f34
commit a29b6d4c5d
6 changed files with 145 additions and 1 deletions

View file

@@ -0,0 +1,115 @@
// agents/api.rs — Direct API backend for agent execution
//
// Uses poc-agent's OpenAI-compatible API client to call models directly
// (vllm, llama.cpp, OpenRouter, etc.) instead of shelling out to claude CLI.
// Implements the tool loop: send prompt → if tool_calls, execute them →
// send results back → repeat until text response.
//
// Activated when config has api_base_url set.
use poc_agent::api::ApiClient;
use poc_agent::types::*;
use poc_agent::tools::{self, ProcessTracker};
use poc_agent::ui_channel::StreamTarget;
/// Run an agent prompt through the direct API with tool support.
/// Returns the final text response after all tool calls are resolved.
/// Run an agent prompt through the direct API with tool support.
///
/// Sends `prompt` as a single user message, then loops: when the model
/// answers with tool calls, each call is executed and its result appended
/// to the transcript before asking again; when the model answers with
/// text, that text is returned. Gives up after `max_turns` round trips.
///
/// * `agent`  — agent name; currently unused inside this function (callers
///   bake it into their `log` closure), kept for interface parity with the
///   CLI backend.
/// * `prompt` — the user prompt to send.
/// * `log`    — sink for progress/diagnostic lines.
///
/// # Errors
/// Returns `Err` when `api_base_url` is not configured, when the API call
/// itself fails, or when the turn limit is exceeded.
pub async fn call_api_with_tools(
    agent: &str,
    prompt: &str,
    log: &dyn Fn(&str),
) -> Result<String, String> {
    let config = crate::config::get();
    let base_url = config.api_base_url.as_deref()
        .ok_or("api_base_url not configured")?;
    let api_key = config.api_key.as_deref().unwrap_or("");
    let model = config.api_model.as_deref().unwrap_or("qwen-2.5-27b");
    let client = ApiClient::new(base_url, api_key, model);
    // Minimal UI channel: we never read the receiver, but the streaming
    // client needs a sender to emit its progress events into.
    let (ui_tx, _ui_rx) = poc_agent::ui_channel::channel();
    // Expose only the `bash` tool — agents drive poc-memory via shell commands.
    let tool_defs: Vec<ToolDef> = tools::definitions()
        .into_iter()
        .filter(|d| d.function.name == "bash")
        .collect();
    let tracker = ProcessTracker::new();
    // The transcript starts with the prompt as a user message.
    let mut messages = vec![Message::user(prompt)];
    let max_turns = 50;
    for turn in 0..max_turns {
        log(&format!("API turn {} ({} messages)", turn, messages.len()));
        let (msg, usage) = client.chat_completion_stream(
            &messages,
            Some(&tool_defs),
            &ui_tx,
            StreamTarget::Autonomous,
            "none",
        ).await.map_err(|e| format!("API error: {}", e))?;
        if let Some(u) = &usage {
            log(&format!("tokens: {} prompt + {} completion",
                u.prompt_tokens, u.completion_tokens));
        }
        let has_content = msg.content.is_some();
        // Clone only the tool-call list (not the whole message) so `msg`
        // can be moved into the history before the calls are executed.
        let calls = msg.tool_calls.clone().unwrap_or_default();
        if !calls.is_empty() {
            // The assistant message carrying tool_calls must precede the
            // tool results in the transcript.
            messages.push(msg);
            for call in &calls {
                log(&format!("tool: {}({})",
                    call.function.name,
                    crate::util::first_n_chars(&call.function.arguments, 80)));
                // Malformed argument JSON degrades to `null`; the dispatcher
                // surfaces the bad input back to the model as a tool result.
                let args: serde_json::Value = serde_json::from_str(&call.function.arguments)
                    .unwrap_or_default();
                let output = tools::dispatch(&call.function.name, &args, &tracker).await;
                log(&format!("tool result: {} chars", output.text.len()));
                messages.push(Message::tool_result(&call.id, &output.text));
            }
            continue;
        }
        // Text-only response — we're done.
        let text = msg.content_text().to_string();
        if text.is_empty() && !has_content {
            // Some models occasionally emit a fully empty turn; nudge and
            // retry rather than failing outright. NOTE(review): a present-
            // but-empty content (`Some("")`) still returns Ok("") below —
            // confirm that is intended.
            log("empty response, retrying");
            messages.push(Message::user(
                "[system] Your previous response was empty. Please respond with text or use a tool."
            ));
            continue;
        }
        return Ok(text);
    }
    Err(format!("agent exceeded {} tool turns", max_turns))
}
/// Synchronous wrapper — creates a tokio runtime and blocks.
/// Used by the existing sync call path in knowledge.rs.
/// Synchronous wrapper around [`call_api_with_tools`].
///
/// Builds a fresh single-threaded tokio runtime per invocation and blocks
/// on the async call; used by the existing sync call path in knowledge.rs.
pub fn call_api_with_tools_sync(
    agent: &str,
    prompt: &str,
    log: &dyn Fn(&str),
) -> Result<String, String> {
    match tokio::runtime::Builder::new_current_thread().enable_all().build() {
        Ok(runtime) => runtime.block_on(call_api_with_tools(agent, prompt, log)),
        Err(e) => Err(format!("tokio runtime: {}", e)),
    }
}

View file

@@ -184,8 +184,15 @@ pub(crate) fn call_haiku(agent: &str, prompt: &str) -> Result<String, String> {
}
/// Call a model using an agent definition's model and tool configuration.
///
/// Routes to the direct API backend when `api_base_url` is configured AND
/// the agent definition declares tools; otherwise falls back to the claude
/// CLI subprocess (including the API-configured-but-toolless case).
pub(crate) fn call_for_def(def: &super::defs::AgentDef, prompt: &str) -> Result<String, String> {
    if crate::config::get().api_base_url.is_some() && !def.tools.is_empty() {
        // Per-agent stderr logger so API turns are attributable.
        let log = |msg: &str| eprintln!("[{}] {}", def.agent, msg);
        super::api::call_api_with_tools_sync(&def.agent, prompt, &log)
    } else {
        call_model_with_tools(&def.agent, &def.model, prompt, &def.tools)
    }
}
/// Parse a JSON response, handling markdown fences.

View file

@@ -16,6 +16,7 @@
// transcript — shared JSONL transcript parsing
pub mod transcript;
pub mod api;
pub mod llm;
pub mod prompts;
pub mod defs;