Fix poc-agent for vllm/Qwen 3.5: reasoning display, tool parser
- Always display reasoning tokens regardless of reasoning_effort setting — Qwen 3.5 thinks natively and the reasoning parser separates it into its own field
- Remove chat_template_kwargs that disabled thinking when reasoning_effort was "none"
- Add chat_template_kwargs field to ChatRequest for vllm compat
- Update provision script: qwen3_xml tool parser, qwen3 reasoning parser, 262K context, 95% GPU memory utilization

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
49ccdf87e1
commit
f83325b44d
3 changed files with 18 additions and 10 deletions
|
|
@@ -34,6 +34,7 @@ pub async fn stream(
|
||||||
enabled: reasoning_effort != "none",
|
enabled: reasoning_effort != "none",
|
||||||
effort: Some(reasoning_effort.to_string()),
|
effort: Some(reasoning_effort.to_string()),
|
||||||
}),
|
}),
|
||||||
|
chat_template_kwargs: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let url = format!("{}/chat/completions", base_url);
|
let url = format!("{}/chat/completions", base_url);
|
||||||
|
|
@@ -96,14 +97,14 @@ pub async fn stream(
|
||||||
if let Some(ref r) = choice.delta.reasoning_content {
|
if let Some(ref r) = choice.delta.reasoning_content {
|
||||||
reasoning_chars += r.len();
|
reasoning_chars += r.len();
|
||||||
has_reasoning = true;
|
has_reasoning = true;
|
||||||
if reasoning_enabled && !r.is_empty() {
|
if !r.is_empty() {
|
||||||
let _ = ui_tx.send(UiMessage::Reasoning(r.clone()));
|
let _ = ui_tx.send(UiMessage::Reasoning(r.clone()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if let Some(ref r) = choice.delta.reasoning {
|
if let Some(ref r) = choice.delta.reasoning {
|
||||||
reasoning_chars += r.len();
|
reasoning_chars += r.len();
|
||||||
has_reasoning = true;
|
has_reasoning = true;
|
||||||
if reasoning_enabled && !r.is_empty() {
|
if !r.is_empty() {
|
||||||
let _ = ui_tx.send(UiMessage::Reasoning(r.clone()));
|
let _ = ui_tx.send(UiMessage::Reasoning(r.clone()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@@ -111,7 +112,7 @@ pub async fn stream(
|
||||||
let s = r.to_string();
|
let s = r.to_string();
|
||||||
reasoning_chars += s.len();
|
reasoning_chars += s.len();
|
||||||
has_reasoning = true;
|
has_reasoning = true;
|
||||||
if reasoning_enabled && !s.is_empty() && s != "null" {
|
if !s.is_empty() && s != "null" {
|
||||||
let _ = ui_tx.send(UiMessage::Reasoning(s));
|
let _ = ui_tx.send(UiMessage::Reasoning(s));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@@ -129,6 +129,9 @@ pub struct ChatRequest {
|
||||||
/// - reasoning.effort (documented: "none" disables entirely)
|
/// - reasoning.effort (documented: "none" disables entirely)
|
||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
pub reasoning: Option<ReasoningConfig>,
|
pub reasoning: Option<ReasoningConfig>,
|
||||||
|
/// vllm chat template kwargs — used to disable thinking on Qwen 3.5
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
pub chat_template_kwargs: Option<serde_json::Value>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
|
|
||||||
|
|
@@ -6,15 +6,15 @@
|
||||||
# Or just scp this script and run it.
|
# Or just scp this script and run it.
|
||||||
#
|
#
|
||||||
# Expects: NVIDIA GPU with sufficient VRAM (B200: 192GB, A100: 80GB)
|
# Expects: NVIDIA GPU with sufficient VRAM (B200: 192GB, A100: 80GB)
|
||||||
# Installs: vllm with Qwen 2.5 27B Instruct
|
# Installs: vllm with Qwen 3.5 27B
|
||||||
# Exposes: OpenAI-compatible API on port 8000
|
# Exposes: OpenAI-compatible API on port 8000
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
MODEL="${MODEL:-Qwen/Qwen2.5-27B-Instruct}"
|
MODEL="${MODEL:-Qwen/Qwen3.5-27B}"
|
||||||
PORT="${PORT:-8000}"
|
PORT="${PORT:-8000}"
|
||||||
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
|
MAX_MODEL_LEN="${MAX_MODEL_LEN:-262144}"
|
||||||
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.90}"
|
GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.95}"
|
||||||
|
|
||||||
echo "=== vllm provisioning ==="
|
echo "=== vllm provisioning ==="
|
||||||
echo "Model: $MODEL"
|
echo "Model: $MODEL"
|
||||||
|
|
@@ -24,7 +24,10 @@ echo ""
|
||||||
|
|
||||||
# --- Install vllm ---
|
# --- Install vllm ---
|
||||||
echo "Installing vllm..."
|
echo "Installing vllm..."
|
||||||
pip install --upgrade vllm 2>&1 | tail -3
|
pip install --upgrade vllm --break-system-packages 2>&1 | tail -3
|
||||||
|
|
||||||
|
# --- Use persistent storage ---
|
||||||
|
export HF_HOME=/workspace/huggingface
|
||||||
|
|
||||||
# --- Verify GPU ---
|
# --- Verify GPU ---
|
||||||
echo ""
|
echo ""
|
||||||
|
|
@@ -34,6 +37,7 @@ echo ""
|
||||||
|
|
||||||
# --- Download model (cached in /root/.cache/huggingface) ---
|
# --- Download model (cached in /root/.cache/huggingface) ---
|
||||||
echo "Downloading model (this may take a while on first run)..."
|
echo "Downloading model (this may take a while on first run)..."
|
||||||
|
pip install --upgrade huggingface_hub --break-system-packages -q 2>/dev/null
|
||||||
python3 -c "from huggingface_hub import snapshot_download; snapshot_download('$MODEL')" 2>&1 | tail -5
|
python3 -c "from huggingface_hub import snapshot_download; snapshot_download('$MODEL')" 2>&1 | tail -5
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
|
|
@@ -47,7 +51,7 @@ exec vllm serve "$MODEL" \
|
||||||
--max-model-len "$MAX_MODEL_LEN" \
|
--max-model-len "$MAX_MODEL_LEN" \
|
||||||
--gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
|
--gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
|
||||||
--enable-prefix-caching \
|
--enable-prefix-caching \
|
||||||
--tool-call-parser hermes \
|
--tool-call-parser qwen3_xml \
|
||||||
--enable-auto-tool-choice \
|
--enable-auto-tool-choice \
|
||||||
--disable-log-requests \
|
--reasoning-parser=qwen3 \
|
||||||
--uvicorn-log-level warning
|
--uvicorn-log-level warning
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue