# syntax=docker/dockerfile:1
# vLLM OpenAI-compatible server for Qwen/Qwen3.5-27B on CUDA 12.9.
# Serves on port 8000; model/flags can be overridden by replacing CMD at `docker run`.

FROM nvidia/cuda:12.9.0-devel-ubuntu22.04

# Build-time only: ARG (not ENV) keeps apt non-interactive during the build
# without leaking DEBIAN_FRONTEND into the runtime environment.
ARG DEBIAN_FRONTEND=noninteractive

ENV PATH="/root/.local/bin:${PATH}"

# System deps in one layer (update + install combined, list cleaned in the
# same layer). curl + ca-certificates are needed for the HEALTHCHECK below.
RUN apt-get update -qq && \
    apt-get install -y -qq --no-install-recommends \
        ca-certificates \
        curl \
        git \
        python3 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/*

# NOTE(review): pin exact versions (vllm==x.y.z etc.) for reproducible builds;
# left unpinned here to avoid hardcoding a possibly-stale release.
RUN pip install --no-cache-dir \
        huggingface_hub \
        ninja \
        vllm

# Pre-download model weights (optional — comment out to pull at runtime)
# RUN python3 -c "from huggingface_hub import snapshot_download; snapshot_download('Qwen/Qwen3.5-27B')"

# NOTE(review): image runs as root. That matches the common
# `-v ~/.cache/huggingface:/root/.cache/huggingface` mount pattern, but consider
# a dedicated non-root user (with HF_HOME pointed at a chowned dir) for hardening.

# Documentation only — publish with `-p 8000:8000` at runtime.
EXPOSE 8000

# vLLM's server exposes GET /health; generous start period because model
# load (27B weights) can take minutes before the endpoint responds.
HEALTHCHECK --interval=30s --timeout=5s --start-period=300s --retries=3 \
    CMD curl -fsS http://localhost:8000/health || exit 1

# Exec form: ENTRYPOINT = the binary, CMD = default arguments users may override.
ENTRYPOINT ["vllm", "serve"]
CMD ["Qwen/Qwen3.5-27B", \
     "--port", "8000", \
     "--max-model-len", "262144", \
     "--gpu-memory-utilization", "0.95", \
     "--enable-prefix-caching", \
     "--enable-auto-tool-choice", \
     "--tool-call-parser", "qwen3_coder", \
     "--reasoning-parser", "qwen3", \
     "--uvicorn-log-level", "warning"]