Adjust vLLM runtime defaults for local stack
Co-Authored-By: Oz <oz-agent@warp.dev>
This commit is contained in:
11
.env.example
11
.env.example
@@ -5,5 +5,12 @@ BACKEND_PORT=8000
|
||||
FRONTEND_PORT=3000
|
||||
HUGGINGFACE_CACHE_DIR=/home/${USER}/.cache/huggingface
|
||||
OPEN_WEBUI_DATA_DIR=./frontend/data/open-webui
|
||||
-VLLM_MAX_MODEL_LEN=4096
|
||||
-VLLM_GPU_MEMORY_UTILIZATION=0.88
|
||||
+VLLM_MAX_MODEL_LEN=512
|
||||
+VLLM_MAX_NUM_BATCHED_TOKENS=256
|
||||
+VLLM_CHUNKED_PREFILL_FLAG=--enable-chunked-prefill
|
||||
+VLLM_MAX_NUM_SEQS=1
|
||||
+VLLM_GPU_MEMORY_UTILIZATION=0.7
|
||||
+VLLM_DTYPE=bfloat16
|
||||
|
||||
+VLLM_ATTENTION_BACKEND=TRITON_ATTN
|
||||
+HSA_OVERRIDE_GFX_VERSION=11.0.0
|
||||
|
||||
Reference in New Issue
Block a user