diff --git a/.env.example b/.env.example
index d5879c9..b7a1304 100644
--- a/.env.example
+++ b/.env.example
@@ -5,5 +5,12 @@ BACKEND_PORT=8000
 FRONTEND_PORT=3000
 HUGGINGFACE_CACHE_DIR=/home/${USER}/.cache/huggingface
 OPEN_WEBUI_DATA_DIR=./frontend/data/open-webui
-VLLM_MAX_MODEL_LEN=4096
-VLLM_GPU_MEMORY_UTILIZATION=0.88
+VLLM_MAX_MODEL_LEN=512
+VLLM_MAX_NUM_BATCHED_TOKENS=256
+VLLM_CHUNKED_PREFILL_FLAG=--enable-chunked-prefill
+VLLM_MAX_NUM_SEQS=1
+VLLM_GPU_MEMORY_UTILIZATION=0.7
+VLLM_DTYPE=bfloat16
+
+VLLM_ATTENTION_BACKEND=TRITON_ATTN
+HSA_OVERRIDE_GFX_VERSION=11.0.0
diff --git a/docker-compose.yml b/docker-compose.yml
index 66be44e..f07fda8 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -9,6 +9,7 @@ services:
       HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
       HF_TOKEN: ${HF_TOKEN}
       PYTORCH_ROCM_ARCH: gfx1103
+      HSA_OVERRIDE_GFX_VERSION: ${HSA_OVERRIDE_GFX_VERSION:-11.0.0}
     command:
       - --model
       - ${GEMMA_MODEL_ID:-google/gemma-3-1b-it}
@@ -17,11 +18,19 @@ services:
       - --port
       - "8000"
       - --dtype
-      - float16
+      - ${VLLM_DTYPE:-bfloat16}
+      - --attention-backend
+      - ${VLLM_ATTENTION_BACKEND:-TRITON_ATTN}
       - --max-model-len
-      - ${VLLM_MAX_MODEL_LEN:-4096}
+      - ${VLLM_MAX_MODEL_LEN:-512}
+      - --max-num-batched-tokens
+      - ${VLLM_MAX_NUM_BATCHED_TOKENS:-256}
+      - --max-num-seqs
+      - ${VLLM_MAX_NUM_SEQS:-1}
+      - ${VLLM_CHUNKED_PREFILL_FLAG:---enable-chunked-prefill}
       - --gpu-memory-utilization
-      - ${VLLM_GPU_MEMORY_UTILIZATION:-0.88}
+      - ${VLLM_GPU_MEMORY_UTILIZATION:-0.7}
+      - --enforce-eager
       - --api-key
       - ${VLLM_API_KEY:-local-dev-key}
     devices: