Initial production-ready Gemma 3 vLLM ROCm stack
Co-Authored-By: Oz <oz-agent@warp.dev>
This commit is contained in:

scripts/install.sh (new executable file, 155 lines)
#!/usr/bin/env bash
# Installs prerequisites (if needed), prepares config files, and starts Gemma 3 + vLLM + chat UI stack.
set -euo pipefail

# Resolve the repository root relative to this script so it works from any CWD.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
||||
|
||||
# Print an informational message to stdout with the script tag prefix.
log() {
  printf '%s\n' "[install] $*"
}
|
||||
|
||||
# Print an error message to stderr with the script error tag prefix.
err() {
  printf '%s\n' "[install][error] $*" >&2
}
|
||||
|
||||
# Abort unless running on Linux; the ROCm and Docker steps below assume it.
require_linux() {
  local kernel_name
  kernel_name="$(uname -s)"
  if [[ "${kernel_name}" != "Linux" ]]; then
    err "This script supports Linux only."
    exit 1
  fi
}
|
||||
|
||||
# Installs Docker Engine + the Compose v2 plugin from Docker's official apt
# repository, ensures the daemon is enabled and running, and adds the invoking
# user to the docker group. Requires sudo; mutates apt sources, the keyring,
# and group membership. Ubuntu-only (called from check_or_install_docker).
install_docker_ubuntu() {
  log "Installing Docker Engine and Compose plugin using official Docker apt repository."
  sudo apt-get update
  sudo apt-get install -y ca-certificates curl gnupg
  sudo install -m 0755 -d /etc/apt/keyrings

  # Import Docker's signing key only once; reruns reuse the existing key file.
  if [[ ! -f /etc/apt/keyrings/docker.gpg ]]; then
    curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg
    sudo chmod a+r /etc/apt/keyrings/docker.gpg
  fi

  # VERSION_CODENAME comes from /etc/os-release; fall back to jammy when the
  # file does not define it (e.g. minimal images).
  source /etc/os-release
  local arch
  arch="$(dpkg --print-architecture)"
  local codename
  codename="${VERSION_CODENAME:-jammy}"

  echo "deb [arch=${arch} signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu ${codename} stable" | sudo tee /etc/apt/sources.list.d/docker.list >/dev/null

  sudo apt-get update
  sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin

  # Make sure the daemon is running now and starts on boot.
  if ! sudo systemctl is-active --quiet docker; then
    sudo systemctl enable --now docker
  fi

  # Older installs may lack the docker group entirely.
  if ! getent group docker >/dev/null; then
    sudo groupadd docker
  fi

  # Group membership only takes effect in new login sessions.
  if ! id -nG "${USER}" | grep -qw docker; then
    sudo usermod -aG docker "${USER}"
    log "Added ${USER} to docker group. You may need to log out and back in for group membership to apply."
  fi
}
|
||||
|
||||
# Verify docker + the Compose v2 plugin are available; on Ubuntu, install them
# automatically when missing. Exits non-zero on non-Ubuntu systems without
# docker, pointing at the troubleshooting doc.
check_or_install_docker() {
  local have_docker=0
  local have_compose=0

  if command -v docker >/dev/null 2>&1; then
    have_docker=1
    # Only probe the compose plugin when the docker CLI itself exists;
    # otherwise "docker compose version" can only fail with a noisy
    # command-not-found error anyway.
    if docker compose version >/dev/null 2>&1; then
      have_compose=1
    fi
  fi

  if [[ ${have_docker} -eq 1 && ${have_compose} -eq 1 ]]; then
    log "Docker and Compose plugin are already available."
    return
  fi

  if [[ -f /etc/os-release ]]; then
    # shellcheck disable=SC1091
    source /etc/os-release
    if [[ "${ID:-}" == "ubuntu" ]]; then
      install_docker_ubuntu
      return
    fi
  fi

  err "Docker/Compose missing and automatic installation is implemented for Ubuntu only."
  err "See docs/TROUBLESHOOTING.md#docker-and-compose-not-available"
  exit 1
}
|
||||
|
||||
# Seed runtime config files from their checked-in examples on first run and
# create the data directories the compose stack bind-mounts. Existing files
# are never overwritten.
prepare_env_files() {
  # .env carries secrets; warn loudly when it is freshly created from the example.
  if [[ ! -f "${REPO_ROOT}/.env" ]]; then
    cp "${REPO_ROOT}/.env.example" "${REPO_ROOT}/.env"
    log "Created .env from .env.example."
    err "IMPORTANT: edit .env and set HF_TOKEN (and optionally VLLM_API_KEY) before production use."
  fi

  # The two service config files follow the same seed-from-example pattern.
  local rel
  for rel in backend/config/model.env frontend/config/frontend.env; do
    if [[ ! -f "${REPO_ROOT}/${rel}" ]]; then
      cp "${REPO_ROOT}/${rel}.example" "${REPO_ROOT}/${rel}"
      log "Created ${rel} from example."
    fi
  done

  mkdir -p "${REPO_ROOT}/models" "${REPO_ROOT}/frontend/data/open-webui"
}
|
||||
|
||||
# Warn (without aborting) when the ROCm device nodes are absent on the host;
# the stack will still start, but GPU access will fail inside the container.
warn_if_rocm_devices_missing() {
  if [[ -e /dev/kfd && -d /dev/dri ]]; then
    return
  fi
  err "ROCm device files /dev/kfd or /dev/dri are not available."
  err "See docs/TROUBLESHOOTING.md#rocm-devices-not-visible-in-host"
}
|
||||
|
||||
# Pull the container images, then start the stack detached. Both steps use the
# repo's compose file with the repo .env for variable interpolation.
start_stack() {
  local compose_cmd=(docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${REPO_ROOT}/.env")

  log "Pulling container images."
  "${compose_cmd[@]}" pull

  log "Starting containers in detached mode."
  "${compose_cmd[@]}" up -d
}
|
||||
|
||||
# Print compose service status for both services plus the local API/UI URLs.
# Ports are read from .env (last assignment wins), surrounding quotes are
# stripped, and 8000/3000 are used as fallbacks when unset. docker compose
# failures are tolerated so the URLs are always printed.
show_status_and_urls() {
  local backend_port frontend_port
  # -f2- (not -f2) keeps any '=' characters inside the value intact.
  backend_port="$(grep -E '^BACKEND_PORT=' "${REPO_ROOT}/.env" | tail -n1 | cut -d'=' -f2- || true)"
  frontend_port="$(grep -E '^FRONTEND_PORT=' "${REPO_ROOT}/.env" | tail -n1 | cut -d'=' -f2- || true)"
  # .env values may be quoted (PORT="8000"); drop quote characters so they do
  # not leak into the printed URLs.
  backend_port="${backend_port//[\"\']/}"
  frontend_port="${frontend_port//[\"\']/}"
  backend_port="${backend_port:-8000}"
  frontend_port="${frontend_port:-3000}"

  log "Backend status:"
  docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${REPO_ROOT}/.env" ps gemma3-vllm || true

  log "Frontend status:"
  docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${REPO_ROOT}/.env" ps chat-ui || true

  printf '\n'
  log "API endpoint: http://localhost:${backend_port}/v1"
  log "Chat UI endpoint: http://localhost:${frontend_port}"
  log "If startup fails, inspect logs with: docker compose logs --tail=200 gemma3-vllm chat-ui"
}
|
||||
|
||||
# Orchestrates the full install flow. Steps run in dependency order; under
# set -e any unchecked failure aborts the script.
main() {
  require_linux                 # Linux-only: Docker Engine + ROCm assumptions below.
  check_or_install_docker       # Installs Docker/Compose on Ubuntu when missing.
  prepare_env_files             # Seeds .env/config files from examples on first run.
  warn_if_rocm_devices_missing  # Non-fatal warning when /dev/kfd or /dev/dri is absent.
  start_stack                   # docker compose pull + up -d.
  show_status_and_urls          # Prints service status and the local endpoints.
}

main "$@"
|
||||
scripts/restart.sh (new executable file, 25 lines)
|
||||
#!/usr/bin/env bash
# Restarts the Gemma 3 vLLM stack and shows service status.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
ENV_FILE="${REPO_ROOT}/.env"

log() {
  printf '%s\n' "[restart] $*"
}

# Fall back to the checked-in example when no real .env exists yet.
[[ -f "${ENV_FILE}" ]] || ENV_FILE="${REPO_ROOT}/.env.example"

# Shared compose invocation: same compose file and env file for every command.
compose() {
  docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${ENV_FILE}" "$@"
}

log "Stopping stack."
compose down

log "Starting stack."
compose up -d

log "Current status:"
compose ps
||||
scripts/test_api.sh (new executable file, 54 lines)
|
||||
#!/usr/bin/env bash
# Tests local vLLM OpenAI-compatible API using curl and validates response shape.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
ENV_FILE="${REPO_ROOT}/.env"

# A real .env is required here (no .env.example fallback) because the API key
# and model id must match what the running backend was started with.
if [[ ! -f "${ENV_FILE}" ]]; then
  echo "[test_api][error] .env file not found. Copy .env.example to .env first." >&2
  exit 1
fi

# shellcheck disable=SC1090
source "${ENV_FILE}"

# Defaults mirror the stack's compose defaults when .env leaves these unset.
BACKEND_PORT="${BACKEND_PORT:-8000}"
GEMMA_MODEL_ID="${GEMMA_MODEL_ID:-google/gemma-3-1b-it}"
VLLM_API_KEY="${VLLM_API_KEY:-EMPTY}"
API_URL="http://localhost:${BACKEND_PORT}/v1/chat/completions"

# Temp files for request/response bodies; the trap guarantees cleanup on any
# exit path, including failures under set -e.
payload_file="$(mktemp)"
response_file="$(mktemp)"
trap 'rm -f "${payload_file}" "${response_file}"' EXIT

# Unquoted heredoc delimiter on purpose: ${GEMMA_MODEL_ID} is expanded into
# the JSON request body.
cat > "${payload_file}" <<JSON
{
  "model": "${GEMMA_MODEL_ID}",
  "messages": [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Say hello from Gemma 3 running on vLLM."}
  ],
  "temperature": 0.2,
  "max_tokens": 64
}
JSON

# -w '%{http_code}' puts only the status code on stdout; the body goes to the file.
http_status="$(curl -sS -o "${response_file}" -w '%{http_code}' -H "Content-Type: application/json" -H "Authorization: Bearer ${VLLM_API_KEY}" -X POST "${API_URL}" --data @"${payload_file}")"

# Accept any 2xx status.
if [[ ! "${http_status}" =~ ^2 ]]; then
  echo "[test_api][error] API returned HTTP ${http_status}" >&2
  cat "${response_file}" >&2
  echo "[test_api][hint] See docs/TROUBLESHOOTING.md#vllm-container-exits-or-fails-healthchecks" >&2
  exit 1
fi

# Shallow shape check: an OpenAI-style completion response always carries a
# "choices" array. Avoids a jq dependency.
if ! grep -q '"choices"' "${response_file}"; then
  echo "[test_api][error] API response did not include expected 'choices' field." >&2
  cat "${response_file}" >&2
  exit 1
fi

echo "[test_api] Success. API responded with expected structure."
cat "${response_file}"
|
||||
scripts/test_python_client.py (new executable file, 75 lines)
|
||||
#!/usr/bin/env python3
"""Tests local vLLM OpenAI-compatible API using openai>=1.x Python client."""

from __future__ import annotations

# Standard library only at import time; the openai client is imported lazily
# inside main() so a missing package yields an actionable hint, not a traceback.
import os
import sys
from pathlib import Path
|
||||
|
||||
|
||||
def load_dotenv(dotenv_path: Path) -> None:
    """Load KEY=VALUE pairs from ``dotenv_path`` into ``os.environ``.

    Variables already present in the environment win (``setdefault``).
    Blank lines, ``#`` comment lines, and lines without ``=`` are skipped;
    one layer of surrounding double then single quotes is stripped from values.
    Missing files are silently ignored.
    """
    if not dotenv_path.exists():
        return

    for entry in dotenv_path.read_text(encoding="utf-8").splitlines():
        stripped = entry.strip()
        if not stripped or stripped.startswith("#") or "=" not in stripped:
            continue
        name, _, raw_value = stripped.partition("=")
        cleaned = raw_value.strip().strip('"').strip("'")
        os.environ.setdefault(name.strip(), cleaned)
|
||||
|
||||
|
||||
def main() -> int:
    """Send one chat completion to the local vLLM server and print the reply.

    Returns a process exit code: 0 on success, 1 on any failure (missing
    openai package, request/transport error, or empty/missing choices).
    """
    # .env lives at the repo root, one level above scripts/.
    repo_root = Path(__file__).resolve().parent.parent
    load_dotenv(repo_root / ".env")

    # Defaults mirror the stack's compose defaults when .env leaves these unset.
    backend_port = os.getenv("BACKEND_PORT", "8000")
    model_id = os.getenv("GEMMA_MODEL_ID", "google/gemma-3-1b-it")
    api_key = os.getenv("VLLM_API_KEY", "EMPTY")
    base_url = f"http://localhost:{backend_port}/v1"

    # Lazy import so a missing client library produces an actionable hint
    # instead of a bare ImportError traceback.
    try:
        from openai import OpenAI
    except ImportError:
        print("[test_python_client][error] openai package is not installed.", file=sys.stderr)
        print("Install it with: python3 -m pip install openai", file=sys.stderr)
        return 1

    client = OpenAI(api_key=api_key, base_url=base_url)

    try:
        response = client.chat.completions.create(
            model=model_id,
            messages=[
                {"role": "system", "content": "You are a concise assistant."},
                {
                    "role": "user",
                    "content": "Say hello from Gemma 3 running on vLLM in one sentence.",
                },
            ],
            temperature=0.2,
            max_tokens=64,
        )
    except Exception as exc:  # broad on purpose: any client/transport failure is fatal here
        print(f"[test_python_client][error] Request failed: {exc}", file=sys.stderr)
        print(
            "[test_python_client][hint] See docs/TROUBLESHOOTING.md#vllm-container-exits-or-fails-healthchecks",
            file=sys.stderr,
        )
        return 1

    # Validate the response shape before dereferencing into it.
    if not response.choices or not response.choices[0].message:
        print("[test_python_client][error] No completion choices returned.", file=sys.stderr)
        return 1

    content = response.choices[0].message.content or ""
    print("[test_python_client] Success. Assistant response:")
    print(content.strip())
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
scripts/test_ui.sh (new executable file, 25 lines)
|
||||
#!/usr/bin/env bash
# Tests whether the chat UI is reachable on localhost frontend port.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
ENV_FILE="${REPO_ROOT}/.env"

# Pull FRONTEND_PORT from .env when present; otherwise rely on the default below.
if [[ -f "${ENV_FILE}" ]]; then
  # shellcheck disable=SC1090
  source "${ENV_FILE}"
fi

FRONTEND_PORT="${FRONTEND_PORT:-3000}"
UI_URL="http://localhost:${FRONTEND_PORT}"

http_status="$(curl -sS -o /dev/null -w '%{http_code}' "${UI_URL}")"

# Accept a plain 200 as well as the common redirect responses.
case "${http_status}" in
  200|301|302)
    ;;
  *)
    echo "[test_ui][error] UI check failed with HTTP status ${http_status} at ${UI_URL}" >&2
    echo "[test_ui][hint] See docs/TROUBLESHOOTING.md#ui-loads-but-cannot-reach-vllm-backend" >&2
    exit 1
    ;;
esac

echo "[test_ui] Chat UI is reachable at ${UI_URL} (HTTP ${http_status})."
|
||||
scripts/uninstall.sh (new executable file, 98 lines)
|
||||
#!/usr/bin/env bash
# Stops and removes the Gemma 3 vLLM stack. Optional --purge removes local model/cache data.
set -euo pipefail

# Resolve the repository root relative to this script so it works from any CWD.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
# Set to 1 by --purge; 0 is the safe, data-preserving default.
PURGE=0
|
||||
|
||||
# Print an informational message to stdout with the script tag prefix.
log() {
  printf '%s\n' "[uninstall] $*"
}
|
||||
|
||||
# Print an error message to stderr with the script error tag prefix.
err() {
  printf '%s\n' "[uninstall][error] $*" >&2
}
|
||||
|
||||
# Print CLI usage to stdout. The quoted heredoc delimiter ('EOF') keeps the
# text literal — no variable expansion occurs inside it.
usage() {
  cat <<'EOF'
Usage: scripts/uninstall.sh [--purge]

Options:
  --purge     Remove local Hugging Face cache directory and ./models data in addition to containers/volumes.
  -h, --help  Show this help message.
EOF
}
|
||||
|
||||
# Parse command-line flags; only --purge and -h/--help are recognized.
# Anything else prints usage and exits non-zero.
for arg in "$@"; do
  case "${arg}" in
    --purge)
      PURGE=1
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      err "Unknown argument: ${arg}"
      usage
      exit 1
      ;;
  esac
done
|
||||
|
||||
# --- Main uninstall flow -----------------------------------------------------

if [[ ! -f "${REPO_ROOT}/docker-compose.yml" ]]; then
  err "docker-compose.yml not found at ${REPO_ROOT}."
  exit 1
fi

# Prefer the user's .env; fall back to the example so compose interpolation
# still resolves when the stack was never configured.
ENV_FILE="${REPO_ROOT}/.env"
if [[ ! -f "${ENV_FILE}" ]]; then
  ENV_FILE="${REPO_ROOT}/.env.example"
fi

log "Stopping stack and removing containers, networks, and named/anonymous volumes."
docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${ENV_FILE}" down -v || true

if [[ ${PURGE} -eq 1 ]]; then
  log "Purge requested. Removing local data directories used by this stack."

  # Last assignment wins; -f2- keeps values that themselves contain '='.
  huggingface_cache_dir="$(grep -E '^HUGGINGFACE_CACHE_DIR=' "${ENV_FILE}" | tail -n1 | cut -d'=' -f2- || true)"
  open_webui_data_dir="$(grep -E '^OPEN_WEBUI_DATA_DIR=' "${ENV_FILE}" | tail -n1 | cut -d'=' -f2- || true)"

  if [[ -n "${huggingface_cache_dir}" ]]; then
    # Expand variables such as ${USER} embedded in the .env value. The value
    # is wrapped in escaped double quotes so paths containing spaces survive
    # the eval as a single word (the previous quoting broke on spaces).
    # NOTE(review): eval on .env content can execute embedded command
    # substitutions — only run --purge against a trusted .env file.
    evaluated_hf_dir="$(eval "printf '%s' \"${huggingface_cache_dir}\"")"
    if [[ -d "${evaluated_hf_dir}" ]]; then
      log "Removing Hugging Face cache directory: ${evaluated_hf_dir}"
      # :? aborts if the path evaluated to empty; -- stops option parsing.
      rm -rf -- "${evaluated_hf_dir:?}"
    else
      log "Hugging Face cache directory not found: ${evaluated_hf_dir}"
    fi
  fi

  if [[ -z "${open_webui_data_dir}" ]]; then
    open_webui_data_dir="./frontend/data/open-webui"
  fi

  # Resolve repo-relative paths against REPO_ROOT instead of the caller's CWD.
  if [[ "${open_webui_data_dir}" == ./* ]]; then
    open_webui_data_dir="${REPO_ROOT}/${open_webui_data_dir#./}"
  fi

  if [[ -d "${open_webui_data_dir}" ]]; then
    log "Removing Open WebUI data directory: ${open_webui_data_dir}"
    rm -rf -- "${open_webui_data_dir:?}"
  fi

  if [[ -d "${REPO_ROOT}/models" ]]; then
    log "Removing local models directory: ${REPO_ROOT}/models"
    rm -rf -- "${REPO_ROOT}/models"
  fi
else
  log "Safe mode enabled (default). Local model/cache data was preserved."
fi

log "Uninstall complete."
|
||||
Reference in New Issue
Block a user