# syntax=docker/dockerfile:1
# vLLM OpenAI-compatible server for Qwen/Qwen3.5-27B on CUDA 12.9.
# Serves on port 8000; model/flags can be overridden by replacing CMD at `docker run`.

FROM nvidia/cuda:12.9.0-devel-ubuntu22.04

# Build-time only: ARG (not ENV) keeps apt non-interactive during the build
# without leaking DEBIAN_FRONTEND into the runtime environment.
ARG DEBIAN_FRONTEND=noninteractive

ENV PATH="/root/.local/bin:${PATH}"

# System deps in one layer (update + install combined, list cleaned in the
# same layer). curl + ca-certificates are needed for the HEALTHCHECK below.
RUN apt-get update -qq && \
    apt-get install -y -qq --no-install-recommends \
        ca-certificates \
        curl \
        git \
        python3 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*

# NOTE(review): pin exact versions (vllm==x.y.z etc.) for reproducible builds;
# left unpinned here to avoid hardcoding a possibly-stale release.
RUN pip install --no-cache-dir \
        huggingface_hub \
        ninja \
        vllm

# Pre-download model weights (optional — comment out to pull at runtime)
# RUN python3 -c "from huggingface_hub import snapshot_download; snapshot_download('Qwen/Qwen3.5-27B')"

# NOTE(review): image runs as root. That matches the common
# `-v ~/.cache/huggingface:/root/.cache/huggingface` mount pattern, but consider
# a dedicated non-root user (with HF_HOME pointed at a chowned dir) for hardening.

# Documentation only — publish with `-p 8000:8000` at runtime.
EXPOSE 8000

# vLLM's server exposes GET /health; generous start period because model
# load (27B weights) can take minutes before the endpoint responds.
HEALTHCHECK --interval=30s --timeout=5s --start-period=300s --retries=3 \
    CMD curl -fsS http://localhost:8000/health || exit 1

# Exec form: ENTRYPOINT = the binary, CMD = default arguments users may override.
ENTRYPOINT ["vllm", "serve"]
CMD ["Qwen/Qwen3.5-27B", \
     "--port", "8000", \
     "--max-model-len", "262144", \
     "--gpu-memory-utilization", "0.95", \
     "--enable-prefix-caching", \
     "--enable-auto-tool-choice", \
     "--tool-call-parser", "qwen3_coder", \
     "--reasoning-parser", "qwen3", \
     "--uvicorn-log-level", "warning"]