Initial production-ready Gemma 3 vLLM ROCm stack
Co-Authored-By: Oz <oz-agent@warp.dev>
9  .env.example  Normal file
@@ -0,0 +1,9 @@
HF_TOKEN=YOUR_HF_TOKEN_HERE
VLLM_API_KEY=YOUR_LOCAL_API_KEY_HERE
GEMMA_MODEL_ID=google/gemma-3-1b-it
BACKEND_PORT=8000
FRONTEND_PORT=3000
HUGGINGFACE_CACHE_DIR=/home/${USER}/.cache/huggingface
OPEN_WEBUI_DATA_DIR=./frontend/data/open-webui
VLLM_MAX_MODEL_LEN=4096
VLLM_GPU_MEMORY_UTILIZATION=0.88
25  .gitignore  vendored  Normal file
@@ -0,0 +1,25 @@
# Environment and secrets
.env
backend/config/model.env
frontend/config/frontend.env

# Python
__pycache__/
*.pyc
*.pyo
*.pyd
.venv/
venv/

# Editor / OS
.DS_Store
.idea/
.vscode/

# Logs
*.log

# Runtime data
backend/data/
frontend/data/
models/
126  README.md  Normal file
@@ -0,0 +1,126 @@
# gemma3-vllm-stack

Production-ready self-hosted stack for running **Gemma 3** with **vLLM** on AMD ROCm, plus a browser chat UI suitable for publishing at `chat.bhatfamily.in`.

## What this stack provides

- Dockerized **vLLM OpenAI-compatible API** (`/v1`) backed by Gemma 3 on ROCm.
- Dockerized **Open WebUI** chat frontend connected to the local vLLM endpoint.
- Non-interactive scripts for install, restart, uninstall, and smoke testing.
- Documentation for operations, upgrades, and troubleshooting.

## Repository layout

```text
gemma3-vllm-stack/
├── .env.example
├── .gitignore
├── docker-compose.yml
├── README.md
├── backend/
│   ├── Dockerfile
│   └── config/
│       └── model.env.example
├── frontend/
│   ├── Dockerfile
│   └── config/
│       └── frontend.env.example
├── scripts/
│   ├── install.sh
│   ├── restart.sh
│   ├── test_api.sh
│   ├── test_python_client.py
│   ├── test_ui.sh
│   └── uninstall.sh
└── docs/
    ├── ARCHITECTURE.md
    ├── TROUBLESHOOTING.md
    └── UPGRADE_NOTES.md
```

## Architecture summary

- The `gemma3-vllm` service runs `vllm/vllm-openai-rocm` and exposes `http://localhost:${BACKEND_PORT}/v1`.
- The `chat-ui` service runs Open WebUI and exposes `http://localhost:${FRONTEND_PORT}`.
- Open WebUI calls `http://gemma3-vllm:8000/v1` on the internal Docker network.

Detailed architecture: `docs/ARCHITECTURE.md`.

## Prerequisites

- Ubuntu 22.04 LTS (amd64)
- AMD ROCm-compatible GPU setup with:
  - `/dev/kfd`
  - `/dev/dri`
- Docker Engine and the docker compose plugin (the install script auto-installs them on Ubuntu if missing)
- Hugging Face token with access to the Gemma 3 model (set as `HF_TOKEN`)

## Quickstart

1. Clone from your Gitea server:

   ```bash
   git clone ssh://git@git.bhatfamily.in/rbhat/gemma3-vllm-stack.git
   cd gemma3-vllm-stack
   ```

2. Create configuration files:

   ```bash
   cp .env.example .env
   cp backend/config/model.env.example backend/config/model.env
   cp frontend/config/frontend.env.example frontend/config/frontend.env
   ```

3. Edit `.env` and set at least:

   - `HF_TOKEN`
   - `VLLM_API_KEY` (recommended even on a LAN)
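Before moving on, it can help to confirm nothing in `.env` was left at its shipped placeholder. A minimal sketch of such a check (not one of the stack's scripts; the sample `.env` content below is written to a temp file purely for illustration):

```python
import re
import tempfile
from pathlib import Path

# Illustrative .env in a temp dir; in practice point this at the real .env.
env_path = Path(tempfile.mkdtemp()) / ".env"
env_path.write_text("HF_TOKEN=YOUR_HF_TOKEN_HERE\nVLLM_API_KEY=s3cret\n")

placeholders = []
for line in env_path.read_text().splitlines():
    line = line.strip()
    if not line or line.startswith("#") or "=" not in line:
        continue
    key, value = line.split("=", 1)
    # The example files ship placeholders shaped like YOUR_..._HERE.
    if re.fullmatch(r"YOUR_\w+_HERE", value.strip()):
        placeholders.append(key)

print(placeholders)  # any keys listed here still need real values
```

Any key reported by this sketch still needs a real value before the backend will start cleanly.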
4. Install/start the stack:

   ```bash
   ./scripts/install.sh
   ```

5. Run smoke tests:

   ```bash
   ./scripts/test_api.sh
   ./scripts/test_ui.sh
   python3 scripts/test_python_client.py
   ```

6. Open a browser:

   - `http://localhost:3000`
   - Reverse proxy externally to `https://chat.bhatfamily.in`

## Operations

- Restart the stack:

  ```bash
  ./scripts/restart.sh
  ```

- View logs:

  ```bash
  docker compose logs --tail=200 gemma3-vllm chat-ui
  ```

- Stop and remove stack resources:

  ```bash
  ./scripts/uninstall.sh
  ```

- Stop/remove the stack and purge local cache/model/UI data:

  ```bash
  ./scripts/uninstall.sh --purge
  ```

## Upgrade workflow

```bash
git pull
docker compose pull
./scripts/restart.sh
```

More details: `docs/UPGRADE_NOTES.md`.

## Default endpoints

- API base URL: `http://localhost:8000/v1`
- UI URL: `http://localhost:3000`

Adjust using `.env`:

- `BACKEND_PORT`
- `FRONTEND_PORT`
- `GEMMA_MODEL_ID`
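For scripting against the API, a client derives its endpoint and auth header from the same values. A small sketch mirroring the defaults used by `scripts/test_python_client.py` (the values below are assumed inline rather than read from `.env`):

```python
# Assumed values; the real scripts read these from .env / os.environ.
backend_port = "8000"
frontend_port = "3000"
vllm_api_key = "local-dev-key"

api_base_url = f"http://localhost:{backend_port}/v1"
ui_url = f"http://localhost:{frontend_port}"
auth_header = {"Authorization": f"Bearer {vllm_api_key}"}

print(api_base_url)
print(ui_url)
```

Any OpenAI-compatible client pointed at `api_base_url` with that bearer header should reach the backend.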
## Notes for `chat.bhatfamily.in`

This repository intentionally does not terminate TLS. Bindings are plain HTTP on host ports and are designed for an external reverse proxy to handle TLS (nginx/Caddy/Cloudflare Tunnel).
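As a sketch only (exact directives depend on your proxy and TLS setup), an external nginx server block for this purpose might look like the following; the certificate paths are placeholders you must replace with your own TLS material:

```nginx
server {
    listen 443 ssl;
    server_name chat.bhatfamily.in;

    # TLS material is managed outside this repo; these paths are placeholders.
    ssl_certificate     /etc/ssl/certs/chat.bhatfamily.in.pem;
    ssl_certificate_key /etc/ssl/private/chat.bhatfamily.in.key;

    location / {
        proxy_pass http://127.0.0.1:3000;
        proxy_set_header Host $host;
        proxy_set_header X-Forwarded-Proto $scheme;
        # Open WebUI uses websockets; forward the upgrade headers.
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";
    }
}
```

Caddy or a Cloudflare Tunnel would achieve the same with their own configuration syntax.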
4  backend/Dockerfile  Normal file
@@ -0,0 +1,4 @@
# Optional backend Dockerfile.
# This stack uses the official vLLM ROCm image directly from docker-compose.yml.
# Keep this file for future customizations.
FROM vllm/vllm-openai-rocm:latest
7  backend/config/model.env.example  Normal file
@@ -0,0 +1,7 @@
HF_TOKEN=YOUR_HF_TOKEN_HERE
VLLM_API_KEY=YOUR_LOCAL_API_KEY_HERE
GEMMA_MODEL_ID=google/gemma-3-1b-it
BACKEND_PORT=8000
HUGGINGFACE_CACHE_DIR=/home/${USER}/.cache/huggingface
VLLM_MAX_MODEL_LEN=4096
VLLM_GPU_MEMORY_UTILIZATION=0.88
69  docker-compose.yml  Normal file
@@ -0,0 +1,69 @@
services:
  gemma3-vllm:
    image: vllm/vllm-openai-rocm:latest
    container_name: gemma3-vllm
    restart: unless-stopped
    env_file:
      - ./backend/config/model.env
    environment:
      HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN}
      HF_TOKEN: ${HF_TOKEN}
      PYTORCH_ROCM_ARCH: gfx1103
    command:
      - --model
      - ${GEMMA_MODEL_ID:-google/gemma-3-1b-it}
      - --host
      - 0.0.0.0
      - --port
      - "8000"
      - --dtype
      - float16
      - --max-model-len
      - ${VLLM_MAX_MODEL_LEN:-4096}
      - --gpu-memory-utilization
      - ${VLLM_GPU_MEMORY_UTILIZATION:-0.88}
      - --api-key
      - ${VLLM_API_KEY:-local-dev-key}
    devices:
      - /dev/kfd
      - /dev/dri
    group_add:
      - video
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp=unconfined
    ports:
      - "${BACKEND_PORT:-8000}:8000"
    volumes:
      - ${HUGGINGFACE_CACHE_DIR:-/home/${USER}/.cache/huggingface}:/root/.cache/huggingface
      - ./models:/models
    healthcheck:
      test: ["CMD-SHELL", "curl -sf http://localhost:8000/health >/dev/null || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 10
      start_period: 120s

  chat-ui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: gemma3-chat-ui
    restart: unless-stopped
    depends_on:
      gemma3-vllm:
        condition: service_started
    env_file:
      - ./frontend/config/frontend.env
    environment:
      WEBUI_AUTH: "False"
      OPENAI_API_BASE_URL: ${OPENAI_API_BASE_URL:-http://gemma3-vllm:8000/v1}
      OPENAI_API_KEY: ${VLLM_API_KEY:-local-dev-key}
      ENABLE_OPENAI_API: "True"
      ENABLE_OLLAMA_API: "False"
      DEFAULT_MODELS: ${GEMMA_MODEL_ID:-google/gemma-3-1b-it}
      GLOBAL_LOG_LEVEL: INFO
      WEBUI_NAME: Gemma 3 via vLLM
    ports:
      - "${FRONTEND_PORT:-3000}:8080"
    volumes:
      - ${OPEN_WEBUI_DATA_DIR:-./frontend/data/open-webui}:/app/backend/data
72  docs/ARCHITECTURE.md  Normal file
@@ -0,0 +1,72 @@
# Architecture

## Component flow

```text
[Browser @ chat.bhatfamily.in]
        |
        | HTTPS (terminated externally)
        v
[Host reverse proxy (external to this repo)]
        |
        | HTTP -> localhost:3000
        v
[chat-ui container: Open WebUI]
        |
        | HTTP (docker internal network)
        v
[gemma3-vllm container: vLLM OpenAI API @ :8000/v1]
        |
        | reads model weights/cache
        v
[Hugging Face cache + local models dir]
        |
        | ROCm runtime
        v
[AMD Radeon 780M (RDNA3 iGPU) via /dev/kfd + /dev/dri]
```

## Services

### `gemma3-vllm`

- Image: `vllm/vllm-openai-rocm:latest`
- Purpose: run the Gemma 3 instruction model behind an OpenAI-compatible API.
- Host port mapping: `${BACKEND_PORT}:8000` (default `8000:8000`)
- Device passthrough:
  - `/dev/kfd`
  - `/dev/dri`
- Security/capabilities for ROCm debugging compatibility:
  - `cap_add: SYS_PTRACE`
  - `security_opt: seccomp=unconfined`
  - `group_add: video`

### `chat-ui`

- Image: `ghcr.io/open-webui/open-webui:main`
- Purpose: browser chat experience with local persistence in the mounted data directory.
- Host port mapping: `${FRONTEND_PORT}:8080` (default `3000:8080`)
- Upstream model endpoint on the docker network:
  - `OPENAI_API_BASE_URL=http://gemma3-vllm:8000/v1`

## Networking

- The Docker Compose default bridge network is used.
- `chat-ui` resolves `gemma3-vllm` by service name.
- External access is via host ports:
  - API: `localhost:8000`
  - UI: `localhost:3000`

## Storage

- Hugging Face cache bind mount:
  - Host: `${HUGGINGFACE_CACHE_DIR}`
  - Container: `/root/.cache/huggingface`
- Optional local models directory:
  - Host: `./models`
  - Container: `/models`
- Open WebUI data:
  - Host: `${OPEN_WEBUI_DATA_DIR}`
  - Container: `/app/backend/data`

## Scaling notes

This repository is designed for **single-node deployment** on one AMD APU/GPU host.

For larger deployments later:

- Move to dedicated GPUs with larger VRAM.
- Use pinned vLLM image tags and explicit engine tuning.
- Consider externalized model storage and distributed orchestration (Kubernetes/Swarm/Nomad).
- Add request routing, autoscaling, and centralized observability.
14  docs/README.md  Normal file
@@ -0,0 +1,14 @@
# Documentation Index

This folder contains operational and lifecycle documentation for the `gemma3-vllm-stack` repository.

## Files

- `ARCHITECTURE.md`: component topology, networking, runtime dependencies, and scaling notes.
- `TROUBLESHOOTING.md`: common failures and copy-paste diagnostics/fixes for ROCm, Docker, vLLM, and UI issues.
- `UPGRADE_NOTES.md`: safe upgrade, rollback, and backup guidance.

## Recommended reading order

1. `ARCHITECTURE.md`
2. `TROUBLESHOOTING.md`
3. `UPGRADE_NOTES.md`

For quick start and day-1 usage, use the repository root `README.md`.
172  docs/TROUBLESHOOTING.md  Normal file
@@ -0,0 +1,172 @@
# Troubleshooting

## ROCm devices not visible in host

Symptoms:

- `/dev/kfd` missing
- `/dev/dri` missing
- vLLM fails to start with ROCm device errors

Checks:

```bash
ls -l /dev/kfd /dev/dri
id
getent group video
```

Expected:

- `/dev/kfd` exists
- `/dev/dri` directory exists
- the user belongs to the `video` group

Fixes:

```bash
sudo usermod -aG video "$USER"
newgrp video
```

Then verify ROCm tools:

```bash
rocminfo | sed -n '1,120p'
```

If ROCm is not healthy, fix the host ROCm installation first.

---

## Docker and Compose not available

Symptoms:

- `docker: command not found`
- `docker compose version` fails

Checks:

```bash
docker --version
docker compose version
```

Fix using the install script (Ubuntu):

```bash
./scripts/install.sh
```

Manual fallback:

```bash
sudo apt-get update
sudo apt-get install -y ca-certificates curl gnupg
sudo install -m 0755 -d /etc/apt/keyrings
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg
sudo chmod a+r /etc/apt/keyrings/docker.gpg
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu jammy stable" | sudo tee /etc/apt/sources.list.d/docker.list >/dev/null
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
sudo usermod -aG docker "$USER"
```

Log out and back in after the group change.

---

## vLLM container exits or fails healthchecks

Symptoms:

- `gemma3-vllm` keeps restarting
- API endpoint unavailable

Checks:

```bash
docker compose ps
docker compose logs --tail=200 gemma3-vllm
```

Common causes and fixes:

1. Missing/invalid Hugging Face token:

   ```bash
   grep -E '^(HF_TOKEN|GEMMA_MODEL_ID)=' .env
   ```

   Ensure `HF_TOKEN` is set to a valid token with access to Gemma 3.

2. Model ID typo:

   ```bash
   grep '^GEMMA_MODEL_ID=' .env
   ```

   Use a valid model, e.g. `google/gemma-3-1b-it`.

3. ROCm runtime/device issues:

   ```bash
   docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video ubuntu:22.04 bash -lc 'ls -l /dev/kfd /dev/dri'
   ```

4. API key mismatch between the backend and the UI/tests:

   ```bash
   grep -E '^(VLLM_API_KEY|OPENAI_API_BASE_URL)=' .env frontend/config/frontend.env 2>/dev/null || true
   ```

   Keep the keys consistent.
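The same consistency check can be expressed as code. A sketch with inlined stand-ins for `.env` and `frontend/config/frontend.env` (the tiny parser mirrors the dotenv loader in `scripts/test_python_client.py`):

```python
def parse_env(text: str) -> dict:
    """Tiny dotenv-style parser: skips blanks, comments, and lines without '='."""
    pairs = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        pairs[key.strip()] = value.strip()
    return pairs

# Inlined stand-ins for the two real files.
root_env = parse_env("VLLM_API_KEY=s3cret\nBACKEND_PORT=8000\n")
frontend_env = parse_env(
    "# frontend\nVLLM_API_KEY=s3cret\nOPENAI_API_BASE_URL=http://gemma3-vllm:8000/v1\n"
)

keys_match = root_env.get("VLLM_API_KEY") == frontend_env.get("VLLM_API_KEY")
print("VLLM_API_KEY consistent:", keys_match)
```

If the comparison prints `False` against your real files, update one side and recreate the `chat-ui` container.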
---

## Out-of-memory (OOM) or low-VRAM errors

Symptoms:

- startup failure referencing memory allocation
- runtime generation failures

Checks:

```bash
docker compose logs --tail=300 gemma3-vllm | grep -Ei 'out of memory|oom|memory|cuda|hip|rocm'
```

Mitigations:

1. Reduce the context length in `.env`:

   ```bash
   VLLM_MAX_MODEL_LEN=2048
   ```

2. Lower the GPU memory utilization target:

   ```bash
   VLLM_GPU_MEMORY_UTILIZATION=0.75
   ```

3. Use a smaller Gemma 3 variant in `.env`.

4. Restart the stack:

   ```bash
   ./scripts/restart.sh
   ```
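Why reducing `VLLM_MAX_MODEL_LEN` helps: the KV cache grows linearly with context length. A back-of-the-envelope sketch (the layer/head numbers below are placeholders, not Gemma 3's actual configuration; read the real values from the model's `config.json`):

```python
def kv_cache_bytes(max_model_len: int, num_layers: int, num_kv_heads: int,
                   head_dim: int, dtype_bytes: int = 2) -> int:
    # 2x: one tensor each for keys and values, per layer, per token slot.
    return 2 * num_layers * max_model_len * num_kv_heads * head_dim * dtype_bytes

# Placeholder hyperparameters for illustration only.
full = kv_cache_bytes(4096, num_layers=24, num_kv_heads=8, head_dim=128)
half = kv_cache_bytes(2048, num_layers=24, num_kv_heads=8, head_dim=128)
print(full // 2**20, "MiB at 4096 tokens vs", half // 2**20, "MiB at 2048")
```

Halving the context halves this per-sequence budget, which is often enough headroom to clear an OOM on a small iGPU VRAM carve-out.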
---

## UI loads but cannot reach vLLM backend

Symptoms:

- The browser opens the UI, but chat requests fail.

Checks:

```bash
docker compose ps
docker compose logs --tail=200 chat-ui
docker compose logs --tail=200 gemma3-vllm
```

Verify the frontend's backend URL:

```bash
grep -E '^OPENAI_API_BASE_URL=' frontend/config/frontend.env
```

Expected value:

```text
OPENAI_API_BASE_URL=http://gemma3-vllm:8000/v1
```

Verify the API directly from the host:

```bash
./scripts/test_api.sh
```

If the API works from the host but not from the UI, recreate the frontend:

```bash
docker compose up -d --force-recreate chat-ui
```

---

## Health checks and endpoint validation

Run all smoke tests:

```bash
./scripts/test_api.sh
./scripts/test_ui.sh
python3 scripts/test_python_client.py
```

If one fails, inspect the corresponding service logs and then restart:

```bash
docker compose logs --tail=200 gemma3-vllm chat-ui
./scripts/restart.sh
```
50  docs/UPGRADE_NOTES.md  Normal file
@@ -0,0 +1,50 @@
# Upgrade Notes

## Standard safe upgrade path

From the repository root:

```bash
git pull
docker compose pull
./scripts/restart.sh
```

Then run the smoke tests:

```bash
./scripts/test_api.sh
./scripts/test_ui.sh
python3 scripts/test_python_client.py
```

## Versioning guidance

- Prefer pinning image tags in `docker-compose.yml` once your deployment is stable.
- Upgrading vLLM may change runtime defaults or engine behavior; check the vLLM release notes before major version jumps.
- Keep `GEMMA_MODEL_ID` explicit in `.env` to avoid unintentional model drift.

## Model upgrade considerations

When changing Gemma 3 variants (for example, from 1B to larger sizes):

- Verify host RAM and GPU memory capacity.
- Expect a re-download of model weights and larger disk usage.
- Re-tune:
  - `VLLM_MAX_MODEL_LEN`
  - `VLLM_GPU_MEMORY_UTILIZATION`
- Re-run the validation scripts after restart.

## Backup recommendations

Before major upgrades, back up local persistent data:

```bash
mkdir -p backups
tar -czf backups/hf-cache-$(date +%Y%m%d-%H%M%S).tar.gz "${HOME}/.cache/huggingface"
tar -czf backups/open-webui-data-$(date +%Y%m%d-%H%M%S).tar.gz frontend/data/open-webui
```

If you use locally predownloaded models:

```bash
tar -czf backups/models-$(date +%Y%m%d-%H%M%S).tar.gz models
```
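Before relying on a backup, it is worth confirming the archive is readable. A sketch using Python's `tarfile` against a throwaway archive (point `archive` at your real backup instead):

```python
import tarfile
import tempfile
from pathlib import Path

# Build a throwaway archive so the example is self-contained.
tmp = Path(tempfile.mkdtemp())
(tmp / "data").mkdir()
(tmp / "data" / "sample.txt").write_text("hello")
archive = tmp / "backup.tar.gz"
with tarfile.open(archive, "w:gz") as tf:
    tf.add(tmp / "data", arcname="data")

# Listing the members without extracting confirms the archive is intact.
with tarfile.open(archive, "r:gz") as tf:
    names = tf.getnames()
print(names)
```

`tar -tzf` from the shell performs the same non-destructive listing.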
## Rollback approach

If a new image/model combination fails:

1. Revert `docker-compose.yml` and `.env` to the previous known-good values.
2. Pull the previous pinned images (if pinned by tag/digest).
3. Restart:

   ```bash
   ./scripts/restart.sh
   ```

4. Re-run the smoke tests.
3  frontend/Dockerfile  Normal file
@@ -0,0 +1,3 @@
# Optional frontend Dockerfile.
# This stack uses the official Open WebUI image directly from docker-compose.yml.
FROM ghcr.io/open-webui/open-webui:main
5  frontend/config/frontend.env.example  Normal file
@@ -0,0 +1,5 @@
FRONTEND_PORT=3000
OPENAI_API_BASE_URL=http://gemma3-vllm:8000/v1
VLLM_API_KEY=YOUR_LOCAL_API_KEY_HERE
GEMMA_MODEL_ID=google/gemma-3-1b-it
OPEN_WEBUI_DATA_DIR=./frontend/data/open-webui
155  scripts/install.sh  Executable file
@@ -0,0 +1,155 @@
#!/usr/bin/env bash
# Installs prerequisites (if needed), prepares config files, and starts the Gemma 3 + vLLM + chat UI stack.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

log() {
  printf '[install] %s\n' "$*"
}

err() {
  printf '[install][error] %s\n' "$*" >&2
}

require_linux() {
  if [[ "$(uname -s)" != "Linux" ]]; then
    err "This script supports Linux only."
    exit 1
  fi
}

install_docker_ubuntu() {
  log "Installing Docker Engine and the Compose plugin using the official Docker apt repository."
  sudo apt-get update
  sudo apt-get install -y ca-certificates curl gnupg
  sudo install -m 0755 -d /etc/apt/keyrings

  if [[ ! -f /etc/apt/keyrings/docker.gpg ]]; then
    curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg
    sudo chmod a+r /etc/apt/keyrings/docker.gpg
  fi

  source /etc/os-release
  local arch
  arch="$(dpkg --print-architecture)"
  local codename
  codename="${VERSION_CODENAME:-jammy}"

  echo "deb [arch=${arch} signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu ${codename} stable" | sudo tee /etc/apt/sources.list.d/docker.list >/dev/null

  sudo apt-get update
  sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin

  if ! sudo systemctl is-active --quiet docker; then
    sudo systemctl enable --now docker
  fi

  if ! getent group docker >/dev/null; then
    sudo groupadd docker
  fi

  if ! id -nG "${USER}" | grep -qw docker; then
    sudo usermod -aG docker "${USER}"
    log "Added ${USER} to the docker group. You may need to log out and back in for the membership to apply."
  fi
}

check_or_install_docker() {
  local have_docker=1
  local have_compose=1

  if ! command -v docker >/dev/null 2>&1; then
    have_docker=0
  fi

  if ! docker compose version >/dev/null 2>&1; then
    have_compose=0
  fi

  if [[ ${have_docker} -eq 1 && ${have_compose} -eq 1 ]]; then
    log "Docker and the Compose plugin are already available."
    return
  fi

  if [[ -f /etc/os-release ]]; then
    source /etc/os-release
    if [[ "${ID:-}" == "ubuntu" ]]; then
      install_docker_ubuntu
      return
    fi
  fi

  err "Docker/Compose missing and automatic installation is implemented for Ubuntu only."
  err "See docs/TROUBLESHOOTING.md#docker-and-compose-not-available"
  exit 1
}

prepare_env_files() {
  if [[ ! -f "${REPO_ROOT}/.env" ]]; then
    cp "${REPO_ROOT}/.env.example" "${REPO_ROOT}/.env"
    log "Created .env from .env.example."
    err "IMPORTANT: edit .env and set HF_TOKEN (and optionally VLLM_API_KEY) before production use."
  fi

  if [[ ! -f "${REPO_ROOT}/backend/config/model.env" ]]; then
    cp "${REPO_ROOT}/backend/config/model.env.example" "${REPO_ROOT}/backend/config/model.env"
    log "Created backend/config/model.env from example."
  fi

  if [[ ! -f "${REPO_ROOT}/frontend/config/frontend.env" ]]; then
    cp "${REPO_ROOT}/frontend/config/frontend.env.example" "${REPO_ROOT}/frontend/config/frontend.env"
    log "Created frontend/config/frontend.env from example."
  fi

  mkdir -p "${REPO_ROOT}/models" "${REPO_ROOT}/frontend/data/open-webui"
}

warn_if_rocm_devices_missing() {
  if [[ ! -e /dev/kfd || ! -d /dev/dri ]]; then
    err "ROCm device files /dev/kfd or /dev/dri are not available."
    err "See docs/TROUBLESHOOTING.md#rocm-devices-not-visible-in-host"
  fi
}

start_stack() {
  log "Pulling container images."
  docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${REPO_ROOT}/.env" pull

  log "Starting containers in detached mode."
  docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${REPO_ROOT}/.env" up -d
}

show_status_and_urls() {
  local backend_port frontend_port
  backend_port="$(grep -E '^BACKEND_PORT=' "${REPO_ROOT}/.env" | tail -n1 | cut -d'=' -f2 || true)"
  frontend_port="$(grep -E '^FRONTEND_PORT=' "${REPO_ROOT}/.env" | tail -n1 | cut -d'=' -f2 || true)"
  backend_port="${backend_port:-8000}"
  frontend_port="${frontend_port:-3000}"

  log "Backend status:"
  docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${REPO_ROOT}/.env" ps gemma3-vllm || true

  log "Frontend status:"
  docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${REPO_ROOT}/.env" ps chat-ui || true

  printf '\n'
  log "API endpoint: http://localhost:${backend_port}/v1"
  log "Chat UI endpoint: http://localhost:${frontend_port}"
  log "If startup fails, inspect logs with: docker compose logs --tail=200 gemma3-vllm chat-ui"
}

main() {
  require_linux
  check_or_install_docker
  prepare_env_files
  warn_if_rocm_devices_missing
  start_stack
  show_status_and_urls
}

main "$@"
25  scripts/restart.sh  Executable file
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
# Restarts the Gemma 3 vLLM stack and shows service status.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
ENV_FILE="${REPO_ROOT}/.env"

log() {
  printf '[restart] %s\n' "$*"
}

if [[ ! -f "${ENV_FILE}" ]]; then
  ENV_FILE="${REPO_ROOT}/.env.example"
fi

log "Stopping stack."
docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${ENV_FILE}" down

log "Starting stack."
docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${ENV_FILE}" up -d

log "Current status:"
docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${ENV_FILE}" ps
54  scripts/test_api.sh  Executable file
@@ -0,0 +1,54 @@
#!/usr/bin/env bash
# Tests the local vLLM OpenAI-compatible API using curl and validates the response shape.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
ENV_FILE="${REPO_ROOT}/.env"

if [[ ! -f "${ENV_FILE}" ]]; then
  echo "[test_api][error] .env file not found. Copy .env.example to .env first." >&2
  exit 1
fi

# shellcheck disable=SC1090
source "${ENV_FILE}"

BACKEND_PORT="${BACKEND_PORT:-8000}"
GEMMA_MODEL_ID="${GEMMA_MODEL_ID:-google/gemma-3-1b-it}"
VLLM_API_KEY="${VLLM_API_KEY:-EMPTY}"
API_URL="http://localhost:${BACKEND_PORT}/v1/chat/completions"

payload_file="$(mktemp)"
response_file="$(mktemp)"
trap 'rm -f "${payload_file}" "${response_file}"' EXIT

cat > "${payload_file}" <<JSON
{
  "model": "${GEMMA_MODEL_ID}",
  "messages": [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Say hello from Gemma 3 running on vLLM."}
  ],
  "temperature": 0.2,
  "max_tokens": 64
}
JSON

http_status="$(curl -sS -o "${response_file}" -w '%{http_code}' -H "Content-Type: application/json" -H "Authorization: Bearer ${VLLM_API_KEY}" -X POST "${API_URL}" --data @"${payload_file}")"

if [[ ! "${http_status}" =~ ^2 ]]; then
  echo "[test_api][error] API returned HTTP ${http_status}" >&2
  cat "${response_file}" >&2
  echo "[test_api][hint] See docs/TROUBLESHOOTING.md#vllm-container-exits-or-fails-healthchecks" >&2
  exit 1
fi

if ! grep -q '"choices"' "${response_file}"; then
  echo "[test_api][error] API response did not include the expected 'choices' field." >&2
  cat "${response_file}" >&2
  exit 1
fi

echo "[test_api] Success. API responded with the expected structure."
cat "${response_file}"
75
scripts/test_python_client.py
Executable file
75
scripts/test_python_client.py
Executable file
@ -0,0 +1,75 @@
#!/usr/bin/env python3
"""Tests the local vLLM OpenAI-compatible API using the openai>=1.x Python client."""

from __future__ import annotations

import os
import sys
from pathlib import Path


def load_dotenv(dotenv_path: Path) -> None:
    if not dotenv_path.exists():
        return

    for raw_line in dotenv_path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        key = key.strip()
        value = value.strip().strip('"').strip("'")
        os.environ.setdefault(key, value)


def main() -> int:
    repo_root = Path(__file__).resolve().parent.parent
    load_dotenv(repo_root / ".env")

    backend_port = os.getenv("BACKEND_PORT", "8000")
    model_id = os.getenv("GEMMA_MODEL_ID", "google/gemma-3-1b-it")
    api_key = os.getenv("VLLM_API_KEY", "EMPTY")
    base_url = f"http://localhost:{backend_port}/v1"

    try:
        from openai import OpenAI
    except ImportError:
        print("[test_python_client][error] openai package is not installed.", file=sys.stderr)
        print("Install it with: python3 -m pip install openai", file=sys.stderr)
        return 1

    client = OpenAI(api_key=api_key, base_url=base_url)

    try:
        response = client.chat.completions.create(
            model=model_id,
            messages=[
                {"role": "system", "content": "You are a concise assistant."},
                {
                    "role": "user",
                    "content": "Say hello from Gemma 3 running on vLLM in one sentence.",
                },
            ],
            temperature=0.2,
            max_tokens=64,
        )
    except Exception as exc:
        print(f"[test_python_client][error] Request failed: {exc}", file=sys.stderr)
        print(
            "[test_python_client][hint] See docs/TROUBLESHOOTING.md#vllm-container-exits-or-fails-healthchecks",
            file=sys.stderr,
        )
        return 1

    if not response.choices or not response.choices[0].message:
        print("[test_python_client][error] No completion choices returned.", file=sys.stderr)
        return 1

    content = response.choices[0].message.content or ""
    print("[test_python_client] Success. Assistant response:")
    print(content.strip())
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
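The `load_dotenv` helper above is a minimal `KEY=VALUE` parser with setdefault semantics: values already present in the real environment win over `.env`. A standalone shell sketch of the same convention, using a throwaway file and an illustrative `DEMO_BACKEND_PORT` key (not a variable the stack actually uses):

```shell
# Minimal .env reader: skip blank/comment lines, split on the first '=',
# and only set a key if it is not already in the environment.
envfile="$(mktemp)"
cat > "${envfile}" <<'EOF'
# comment lines are ignored
DEMO_BACKEND_PORT=9001
EOF

while IFS='=' read -r key value; do
  case "${key}" in ''|\#*) continue ;; esac
  if [ -z "$(printenv "${key}")" ]; then
    export "${key}=${value}"
  fi
done < "${envfile}"
rm -f "${envfile}"

echo "${DEMO_BACKEND_PORT}"
```

Because the loop exports only unset keys, running this with `DEMO_BACKEND_PORT` already exported would leave the existing value untouched, mirroring `os.environ.setdefault`.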
scripts/test_ui.sh (new executable file, 25 lines)
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
# Tests whether the chat UI is reachable on the local frontend port.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
ENV_FILE="${REPO_ROOT}/.env"

if [[ -f "${ENV_FILE}" ]]; then
  # shellcheck disable=SC1090
  source "${ENV_FILE}"
fi

FRONTEND_PORT="${FRONTEND_PORT:-3000}"
UI_URL="http://localhost:${FRONTEND_PORT}"

http_status="$(curl -sS -o /dev/null -w '%{http_code}' "${UI_URL}")"

if [[ "${http_status}" != "200" && "${http_status}" != "301" && "${http_status}" != "302" ]]; then
  echo "[test_ui][error] UI check failed with HTTP status ${http_status} at ${UI_URL}" >&2
  echo "[test_ui][hint] See docs/TROUBLESHOOTING.md#ui-loads-but-cannot-reach-vllm-backend" >&2
  exit 1
fi

echo "[test_ui] Chat UI is reachable at ${UI_URL} (HTTP ${http_status})."
scripts/uninstall.sh (new executable file, 98 lines)
@@ -0,0 +1,98 @@
#!/usr/bin/env bash
# Stops and removes the Gemma 3 vLLM stack. Optional --purge removes local model/cache data.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
PURGE=0

log() {
  printf '[uninstall] %s\n' "$*"
}

err() {
  printf '[uninstall][error] %s\n' "$*" >&2
}

usage() {
  cat <<'EOF'
Usage: scripts/uninstall.sh [--purge]

Options:
  --purge      Remove local Hugging Face cache directory and ./models data in addition to containers/volumes.
  -h, --help   Show this help message.
EOF
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --purge)
      PURGE=1
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      err "Unknown argument: $1"
      usage
      exit 1
      ;;
  esac
  shift
done

if [[ ! -f "${REPO_ROOT}/docker-compose.yml" ]]; then
  err "docker-compose.yml not found at ${REPO_ROOT}."
  exit 1
fi

ENV_FILE="${REPO_ROOT}/.env"
if [[ ! -f "${ENV_FILE}" ]]; then
  ENV_FILE="${REPO_ROOT}/.env.example"
fi

log "Stopping stack and removing containers, networks, and named/anonymous volumes."
docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${ENV_FILE}" down -v || true

if [[ ${PURGE} -eq 1 ]]; then
  log "Purge requested. Removing local data directories used by this stack."

  huggingface_cache_dir="$(grep -E '^HUGGINGFACE_CACHE_DIR=' "${ENV_FILE}" | tail -n1 | cut -d'=' -f2- || true)"
  open_webui_data_dir="$(grep -E '^OPEN_WEBUI_DATA_DIR=' "${ENV_FILE}" | tail -n1 | cut -d'=' -f2- || true)"

  if [[ -n "${huggingface_cache_dir}" ]]; then
    # Expand potential variables such as ${USER}
    evaluated_hf_dir="$(eval "printf '%s' \"${huggingface_cache_dir}\"")"
    if [[ -d "${evaluated_hf_dir}" ]]; then
      log "Removing Hugging Face cache directory: ${evaluated_hf_dir}"
      rm -rf "${evaluated_hf_dir}"
    else
      log "Hugging Face cache directory not found: ${evaluated_hf_dir}"
    fi
  fi

  if [[ -z "${open_webui_data_dir}" ]]; then
    open_webui_data_dir="./frontend/data/open-webui"
  fi

  if [[ "${open_webui_data_dir}" == ./* ]]; then
    open_webui_data_dir="${REPO_ROOT}/${open_webui_data_dir#./}"
  fi

  if [[ -d "${open_webui_data_dir}" ]]; then
    log "Removing Open WebUI data directory: ${open_webui_data_dir}"
    rm -rf "${open_webui_data_dir}"
  fi

  if [[ -d "${REPO_ROOT}/models" ]]; then
    log "Removing local models directory: ${REPO_ROOT}/models"
    rm -rf "${REPO_ROOT}/models"
  fi
else
  log "Safe mode enabled (default). Local model/cache data was preserved."
fi

log "Uninstall complete."
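The purge path expands `${USER}`-style references from `.env` with `eval`, which is sensitive to quoting and executes whatever the value contains. A hedged sketch of an eval-free alternative that substitutes only the `${USER}` placeholder via `sed` (the `raw_dir` value and `user_name` fallback are illustrative):

```shell
# Expand a literal '${USER}' placeholder without eval. Note: this only
# handles ${USER}, and assumes user_name contains no sed metacharacters.
user_name="${USER:-demo}"
raw_dir='/home/${USER}/.cache/huggingface'
expanded="$(printf '%s' "${raw_dir}" | sed "s|\${USER}|${user_name}|g")"
echo "${expanded}"
```

The trade-off: `eval` expands any shell syntax in the value (including command substitution), while the `sed` form is inert for everything except the one whitelisted placeholder.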