Consolidate poc-memory and poc-agent configs

poc-memory now reads from poc-agent's config.json5 as the primary
config source. Memory-specific settings live in a "memory" section;
API credentials are resolved from the shared model/backend config
instead of being duplicated.
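A sketch of the consolidated layout (section and key names here are illustrative, not the actual poc-agent schema; only the `memory` section and the model → backend indirection are taken from this change):

```json5
// ~/.config/poc-agent/config.json5 — hypothetical shape
{
  backends: {
    local_vllm: {
      base_url: "http://localhost:8000/v1",
      api_key: "sk-local",
    },
  },
  models: {
    mem_agent: { backend: "local_vllm", model: "Qwen/Qwen3.5-27B" },
  },
  // Memory-specific settings now live here instead of a separate file.
  memory: {
    agent_model: "mem_agent",
  },
}
```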

- Add "memory" section to ~/.config/poc-agent/config.json5
- poc-memory config.rs: try shared config first, fall back to
  legacy JSONL
- API fields (base_url, api_key, model) resolved via
  memory.agent_model -> models -> backend lookup
- Add json5 dependency for proper JSON5 parsing
- Update provisioning scripts: hermes -> qwen3_coder tool parser
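The `memory.agent_model -> models -> backend` resolution described above can be sketched as follows. The struct shapes and field names are hypothetical stand-ins for the shared config, not the actual poc-memory code:

```rust
use std::collections::HashMap;

// Hypothetical mirror of the shared config; names are illustrative.
struct SharedConfig {
    memory_agent_model: String,          // memory.agent_model
    models: HashMap<String, ModelEntry>, // models section
    backends: HashMap<String, Backend>,  // backend definitions
}

struct ModelEntry {
    backend: String, // key into `backends`
    model: String,   // model identifier sent to the API
}

struct Backend {
    base_url: String,
    api_key: String,
}

/// API settings resolved for poc-memory.
#[derive(Debug, PartialEq)]
struct ApiConfig {
    base_url: String,
    api_key: String,
    model: String,
}

/// Follow memory.agent_model -> models -> backend, so credentials are
/// read from the shared config rather than duplicated.
fn resolve_api(cfg: &SharedConfig) -> Option<ApiConfig> {
    let entry = cfg.models.get(&cfg.memory_agent_model)?;
    let backend = cfg.backends.get(&entry.backend)?;
    Some(ApiConfig {
        base_url: backend.base_url.clone(),
        api_key: backend.api_key.clone(),
        model: entry.model.clone(),
    })
}
```

Returning `Option` keeps the "fall back to legacy JSONL" path simple: a `None` from the shared-config lookup is the signal to try the old file.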

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Author: Kent Overstreet
Date:   2026-03-19 21:49:58 -04:00
parent  4c7c3c762c
commit  d9b56a02c3
6 changed files with 146 additions and 18 deletions

scripts/Dockerfile.vllm (new file)

@@ -0,0 +1,26 @@
FROM nvidia/cuda:12.9.0-devel-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive
ENV PATH="/root/.local/bin:${PATH}"
RUN apt-get update -qq && \
    apt-get install -y -qq python3 python3-pip git && \
    rm -rf /var/lib/apt/lists/*
RUN pip install --no-cache-dir vllm ninja huggingface_hub
# Pre-download model weights (optional — comment out to pull at runtime)
# RUN python3 -c "from huggingface_hub import snapshot_download; snapshot_download('Qwen/Qwen3.5-27B')"
EXPOSE 8000
ENTRYPOINT ["vllm", "serve"]
CMD ["Qwen/Qwen3.5-27B", \
     "--port", "8000", \
     "--max-model-len", "262144", \
     "--gpu-memory-utilization", "0.95", \
     "--enable-prefix-caching", \
     "--enable-auto-tool-choice", \
     "--tool-call-parser", "qwen3_coder", \
     "--reasoning-parser", "qwen3", \
     "--uvicorn-log-level", "warning"]


@@ -81,8 +81,8 @@ exec vllm serve "$MODEL" \
   --max-model-len "$MAX_MODEL_LEN" \
   --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
   --enable-prefix-caching \
-  --tool-call-parser hermes \
   --enable-auto-tool-choice \
+  --tool-call-parser qwen3_coder \
   --reasoning-parser qwen3 \
   --trust-remote-code \
   --max-cudagraph-capture-size 64 \


@@ -51,7 +51,7 @@ exec vllm serve "$MODEL" \
   --max-model-len "$MAX_MODEL_LEN" \
   --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION" \
   --enable-prefix-caching \
-  --tool-call-parser hermes \
   --enable-auto-tool-choice \
+  --tool-call-parser qwen3_coder \
   --reasoning-parser=qwen3 \
   --uvicorn-log-level warning