Adjust vLLM runtime defaults for local stack
Co-Authored-By: Oz <oz-agent@warp.dev>
This commit is contained in:
11
.env.example
11
.env.example
@@ -5,5 +5,12 @@ BACKEND_PORT=8000
|
|||||||
FRONTEND_PORT=3000
|
FRONTEND_PORT=3000
|
||||||
HUGGINGFACE_CACHE_DIR=/home/${USER}/.cache/huggingface
|
HUGGINGFACE_CACHE_DIR=/home/${USER}/.cache/huggingface
|
||||||
OPEN_WEBUI_DATA_DIR=./frontend/data/open-webui
|
OPEN_WEBUI_DATA_DIR=./frontend/data/open-webui
|
||||||
VLLM_MAX_MODEL_LEN=4096
|
VLLM_MAX_MODEL_LEN=512
|
||||||
VLLM_GPU_MEMORY_UTILIZATION=0.88
|
VLLM_MAX_NUM_BATCHED_TOKENS=256
|
||||||
|
VLLM_CHUNKED_PREFILL_FLAG=--enable-chunked-prefill
|
||||||
|
VLLM_MAX_NUM_SEQS=1
|
||||||
|
VLLM_GPU_MEMORY_UTILIZATION=0.7
|
||||||
|
VLLM_DTYPE=bfloat16
|
||||||
|
|
||||||
|
VLLM_ATTENTION_BACKEND=TRITON_ATTN
|
||||||
|
HSA_OVERRIDE_GFX_VERSION=11.0.0
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ services:
|
|||||||
HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
|
HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
|
||||||
HF_TOKEN: ${HF_TOKEN}
|
HF_TOKEN: ${HF_TOKEN}
|
||||||
PYTORCH_ROCM_ARCH: gfx1103
|
PYTORCH_ROCM_ARCH: gfx1103
|
||||||
|
HSA_OVERRIDE_GFX_VERSION: ${HSA_OVERRIDE_GFX_VERSION:-11.0.0}
|
||||||
command:
|
command:
|
||||||
- --model
|
- --model
|
||||||
- ${GEMMA_MODEL_ID:-google/gemma-3-1b-it}
|
- ${GEMMA_MODEL_ID:-google/gemma-3-1b-it}
|
||||||
@@ -17,11 +18,19 @@ services:
|
|||||||
- --port
|
- --port
|
||||||
- "8000"
|
- "8000"
|
||||||
- --dtype
|
- --dtype
|
||||||
- float16
|
- ${VLLM_DTYPE:-bfloat16}
|
||||||
|
- --attention-backend
|
||||||
|
- ${VLLM_ATTENTION_BACKEND:-TRITON_ATTN}
|
||||||
- --max-model-len
|
- --max-model-len
|
||||||
- ${VLLM_MAX_MODEL_LEN:-4096}
|
- ${VLLM_MAX_MODEL_LEN:-512}
|
||||||
|
- --max-num-batched-tokens
|
||||||
|
- ${VLLM_MAX_NUM_BATCHED_TOKENS:-256}
|
||||||
|
- --max-num-seqs
|
||||||
|
- ${VLLM_MAX_NUM_SEQS:-1}
|
||||||
|
- ${VLLM_CHUNKED_PREFILL_FLAG:---enable-chunked-prefill}
|
||||||
- --gpu-memory-utilization
|
- --gpu-memory-utilization
|
||||||
- ${VLLM_GPU_MEMORY_UTILIZATION:-0.88}
|
- ${VLLM_GPU_MEMORY_UTILIZATION:-0.7}
|
||||||
|
- --enforce-eager
|
||||||
- --api-key
|
- --api-key
|
||||||
- ${VLLM_API_KEY:-local-dev-key}
|
- ${VLLM_API_KEY:-local-dev-key}
|
||||||
devices:
|
devices:
|
||||||
|
|||||||
Reference in New Issue
Block a user