# gemma3-vllm-stack/docker-compose.yml
# Source listing metadata: 2026-04-19 09:42:52 +05:30 — 79 lines, 2.3 KiB, YAML
# Serves a Gemma 3 model through vLLM's OpenAI-compatible API on an AMD
# (ROCm) GPU, with Open WebUI as the chat frontend. Tunables are supplied
# via .env and the referenced env_files; defaults target low-VRAM hardware.
services:
  gemma3-vllm:
    image: vllm/vllm-openai-rocm:latest
    container_name: gemma3-vllm
    restart: unless-stopped
    env_file:
      - ./backend/config/model.env
    environment:
      # Both token names are set deliberately: older huggingface_hub reads
      # HUGGING_FACE_HUB_TOKEN, newer releases read HF_TOKEN.
      HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
      HF_TOKEN: ${HF_TOKEN}
      # GPU target for ROCm kernels (gfx1103 is RDNA3 iGPU class —
      # confirm it matches the actual hardware).
      PYTORCH_ROCM_ARCH: gfx1103
      # Spoofs the reported gfx version so HSA initializes on consumer
      # GPUs ROCm does not officially list (11.0.0 ≈ gfx1100 — verify).
      HSA_OVERRIDE_GFX_VERSION: ${HSA_OVERRIDE_GFX_VERSION:-11.0.0}
    command:
      - --model
      - ${GEMMA_MODEL_ID:?GEMMA_MODEL_ID must be set in .env}
      - --host
      - 0.0.0.0
      - --port
      - "8000"
      - --dtype
      - ${VLLM_DTYPE:-bfloat16}
      - --attention-backend
      - ${VLLM_ATTENTION_BACKEND:-TRITON_ATTN}
      # Conservative context/batch defaults sized for low-VRAM cards;
      # raise them via .env when the GPU allows.
      - --max-model-len
      - ${VLLM_MAX_MODEL_LEN:-512}
      - --max-num-batched-tokens
      - ${VLLM_MAX_NUM_BATCHED_TOKENS:-256}
      - --max-num-seqs
      - ${VLLM_MAX_NUM_SEQS:-1}
      # Default enables chunked prefill; override VLLM_CHUNKED_PREFILL_FLAG
      # in .env to substitute a different flag.
      - ${VLLM_CHUNKED_PREFILL_FLAG:---enable-chunked-prefill}
      - --gpu-memory-utilization
      - ${VLLM_GPU_MEMORY_UTILIZATION:-0.7}
      # Skip graph capture — slower per-token but more memory-friendly and
      # more robust on overridden gfx targets.
      - --enforce-eager
      - --api-key
      - ${VLLM_API_KEY:-local-dev-key}
    # ROCm GPU access: /dev/kfd (compute interface) and /dev/dri (render
    # nodes); the container user must also be in the host's video group.
    devices:
      - /dev/kfd
      - /dev/dri
    group_add:
      - video
    # SYS_PTRACE + unconfined seccomp are commonly required by ROCm
    # debugging/profiling paths inside the container.
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    ports:
      - "${BACKEND_PORT:-8000}:8000"
    volumes:
      # Persist HF downloads on the host. NOTE(review): the nested ${USER}
      # inside the default requires a recent Docker Compose that supports
      # interpolation within default values — confirm the deployed version.
      - ${HUGGINGFACE_CACHE_DIR:-/home/${USER}/.cache/huggingface}:/root/.cache/huggingface
      - ./models:/models
    healthcheck:
      # NOTE(review): assumes curl exists in the image — verify, or switch
      # to a python/urllib probe if the image is curl-less.
      test: ["CMD-SHELL", "curl -sf http://localhost:8000/health >/dev/null || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 10
      # First boot may download + load the model; give it time before
      # failures count against the retry budget.
      start_period: 120s

  chat-ui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: gemma3-chat-ui
    restart: unless-stopped
    depends_on:
      gemma3-vllm:
        # FIX: gate on the backend's healthcheck rather than mere container
        # start (was service_started), so the UI does not probe vLLM while
        # the model is still loading.
        condition: service_healthy
    env_file:
      - ./frontend/config/frontend.env
    environment:
      WEBUI_AUTH: "False"
      OPENAI_API_BASE_URL: ${OPENAI_API_BASE_URL:-http://gemma3-vllm:8000/v1}
      # Must match the --api-key the vLLM server was started with.
      OPENAI_API_KEY: ${VLLM_API_KEY:-local-dev-key}
      ENABLE_OPENAI_API: "True"
      ENABLE_OLLAMA_API: "False"
      DEFAULT_MODELS: ${GEMMA_MODEL_ID:?GEMMA_MODEL_ID must be set in .env}
      GLOBAL_LOG_LEVEL: INFO
      WEBUI_NAME: Gemma 3 via vLLM
    ports:
      - "${FRONTEND_PORT:-3000}:8080"
    volumes:
      - ${OPEN_WEBUI_DATA_DIR:-./frontend/data/open-webui}:/app/backend/data