diff --git a/poc-agent/src/api/openai.rs b/poc-agent/src/api/openai.rs index e34dc5d..e40f59e 100644 --- a/poc-agent/src/api/openai.rs +++ b/poc-agent/src/api/openai.rs @@ -34,6 +34,7 @@ pub async fn stream( enabled: reasoning_effort != "none", effort: Some(reasoning_effort.to_string()), }), + chat_template_kwargs: None, }; let url = format!("{}/chat/completions", base_url); @@ -96,14 +97,14 @@ pub async fn stream( if let Some(ref r) = choice.delta.reasoning_content { reasoning_chars += r.len(); has_reasoning = true; - if reasoning_enabled && !r.is_empty() { + if !r.is_empty() { let _ = ui_tx.send(UiMessage::Reasoning(r.clone())); } } if let Some(ref r) = choice.delta.reasoning { reasoning_chars += r.len(); has_reasoning = true; - if reasoning_enabled && !r.is_empty() { + if !r.is_empty() { let _ = ui_tx.send(UiMessage::Reasoning(r.clone())); } } @@ -111,7 +112,7 @@ let s = r.to_string(); reasoning_chars += s.len(); has_reasoning = true; - if reasoning_enabled && !s.is_empty() && s != "null" { + if !s.is_empty() && s != "null" { let _ = ui_tx.send(UiMessage::Reasoning(s)); } } diff --git a/poc-agent/src/types.rs b/poc-agent/src/types.rs index 60d6dd1..2cdc62c 100644 --- a/poc-agent/src/types.rs +++ b/poc-agent/src/types.rs @@ -129,6 +129,9 @@ pub struct ChatRequest { /// - reasoning.effort (documented: "none" disables entirely) #[serde(skip_serializing_if = "Option::is_none")] pub reasoning: Option<ReasoningConfig>, + /// vllm chat template kwargs — used to disable thinking on Qwen 3.5 + #[serde(skip_serializing_if = "Option::is_none")] + pub chat_template_kwargs: Option<serde_json::Value>, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/scripts/provision-vllm.sh b/scripts/provision-vllm.sh index e7b3a91..ee35670 100755 --- a/scripts/provision-vllm.sh +++ b/scripts/provision-vllm.sh @@ -6,15 +6,15 @@ # Or just scp this script and run it. 
# # Expects: NVIDIA GPU with sufficient VRAM (B200: 192GB, A100: 80GB) -# Installs: vllm with Qwen 2.5 27B Instruct +# Installs: vllm with Qwen 3.5 27B # Exposes: OpenAI-compatible API on port 8000 set -euo pipefail -MODEL="${MODEL:-Qwen/Qwen2.5-27B-Instruct}" +MODEL="${MODEL:-Qwen/Qwen3.5-27B}" PORT="${PORT:-8000}" -MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}" -GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}" +MAX_MODEL_LEN="${MAX_MODEL_LEN:-262144}" +GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.95}" echo "=== vllm provisioning ===" echo "Model: $MODEL" @@ -24,7 +24,10 @@ echo "" # --- Install vllm --- echo "Installing vllm..." -pip install --upgrade vllm 2>&1 | tail -3 +pip install --upgrade vllm --break-system-packages 2>&1 | tail -3 + +# --- Use persistent storage --- +export HF_HOME=/workspace/huggingface # --- Verify GPU --- echo "" @@ -34,6 +37,7 @@ echo "" # --- Download model (cached in /root/.cache/huggingface) --- echo "Downloading model (this may take a while on first run)..." +pip install --upgrade huggingface_hub --break-system-packages -q 2>/dev/null python3 -c "from huggingface_hub import snapshot_download; snapshot_download('$MODEL')" 2>&1 | tail -5 echo "" @@ -47,7 +51,7 @@ exec vllm serve "$MODEL" \ --max-model-len "$MAX_MODEL_LEN" \ --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \ --enable-prefix-caching \ - --tool-call-parser hermes \ + --tool-call-parser qwen3_xml \ --enable-auto-tool-choice \ - --disable-log-requests \ + --reasoning-parser=qwen3 \ --uvicorn-log-level warning