From ef8537e923dc401c93e8252b3884b52a22dd69d3 Mon Sep 17 00:00:00 2001 From: Raghav <1858838+quantumrag@users.noreply.github.com> Date: Sat, 18 Apr 2026 22:53:38 +0530 Subject: [PATCH] Initial production-ready Gemma 3 vLLM ROCm stack Co-Authored-By: Oz --- .env.example | 9 ++ .gitignore | 25 ++++ README.md | 126 ++++++++++++++++++++ backend/Dockerfile | 4 + backend/config/model.env.example | 7 ++ docker-compose.yml | 69 +++++++++++ docs/ARCHITECTURE.md | 72 +++++++++++ docs/README.md | 14 +++ docs/TROUBLESHOOTING.md | 172 +++++++++++++++++++++++++++ docs/UPGRADE_NOTES.md | 50 ++++++++ frontend/Dockerfile | 3 + frontend/config/frontend.env.example | 5 + scripts/install.sh | 155 ++++++++++++++++++++++++ scripts/restart.sh | 25 ++++ scripts/test_api.sh | 54 +++++++++ scripts/test_python_client.py | 75 ++++++++++++ scripts/test_ui.sh | 25 ++++ scripts/uninstall.sh | 98 +++++++++++++++ 18 files changed, 988 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 README.md create mode 100644 backend/Dockerfile create mode 100644 backend/config/model.env.example create mode 100644 docker-compose.yml create mode 100644 docs/ARCHITECTURE.md create mode 100644 docs/README.md create mode 100644 docs/TROUBLESHOOTING.md create mode 100644 docs/UPGRADE_NOTES.md create mode 100644 frontend/Dockerfile create mode 100644 frontend/config/frontend.env.example create mode 100755 scripts/install.sh create mode 100755 scripts/restart.sh create mode 100755 scripts/test_api.sh create mode 100755 scripts/test_python_client.py create mode 100755 scripts/test_ui.sh create mode 100755 scripts/uninstall.sh diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..d5879c9 --- /dev/null +++ b/.env.example @@ -0,0 +1,9 @@ +HF_TOKEN=YOUR_HF_TOKEN_HERE +VLLM_API_KEY=YOUR_LOCAL_API_KEY_HERE +GEMMA_MODEL_ID=google/gemma-3-1b-it +BACKEND_PORT=8000 +FRONTEND_PORT=3000 +HUGGINGFACE_CACHE_DIR=/home/${USER}/.cache/huggingface 
+OPEN_WEBUI_DATA_DIR=./frontend/data/open-webui +VLLM_MAX_MODEL_LEN=4096 +VLLM_GPU_MEMORY_UTILIZATION=0.88 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d031c94 --- /dev/null +++ b/.gitignore @@ -0,0 +1,25 @@ +# Environment and secrets +.env +backend/config/model.env +frontend/config/frontend.env + +# Python +__pycache__/ +*.pyc +*.pyo +*.pyd +.venv/ +venv/ + +# Editor / OS +.DS_Store +.idea/ +.vscode/ + +# Logs +*.log + +# Runtime data +backend/data/ +frontend/data/ +models/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..4c0b1c2 --- /dev/null +++ b/README.md @@ -0,0 +1,126 @@ +# gemma3-vllm-stack +Production-ready self-hosted stack for running **Gemma 3** with **vLLM** on AMD ROCm, plus a browser chat UI suitable for publishing at `chat.bhatfamily.in`. + +## What this stack provides +- Dockerized **vLLM OpenAI-compatible API** (`/v1`) backed by Gemma 3 on ROCm. +- Dockerized **Open WebUI** chat frontend connected to the local vLLM endpoint. +- Non-interactive scripts for install, restart, uninstall, and smoke testing. +- Documentation for operations, upgrades, and troubleshooting. + +## Repository layout +```text +gemma3-vllm-stack/ +├── .env.example +├── .gitignore +├── docker-compose.yml +├── README.md +├── backend/ +│ ├── Dockerfile +│ └── config/ +│ └── model.env.example +├── frontend/ +│ ├── Dockerfile +│ └── config/ +│ └── frontend.env.example +├── scripts/ +│ ├── install.sh +│ ├── restart.sh +│ ├── test_api.sh +│ ├── test_python_client.py +│ ├── test_ui.sh +│ └── uninstall.sh +└── docs/ + ├── ARCHITECTURE.md + ├── TROUBLESHOOTING.md + └── UPGRADE_NOTES.md +``` + +## Architecture summary +- `gemma3-vllm` service runs `vllm/vllm-openai-rocm` and exposes `http://localhost:${BACKEND_PORT}/v1`. +- `chat-ui` service runs Open WebUI and exposes `http://localhost:${FRONTEND_PORT}`. +- Open WebUI calls `http://gemma3-vllm:8000/v1` on the internal Docker network. + +Detailed architecture: `docs/ARCHITECTURE.md`. 
+ +## Prerequisites +- Ubuntu 22.04 LTS (amd64) +- AMD ROCm-compatible GPU setup with: + - `/dev/kfd` + - `/dev/dri` +- Docker Engine and docker compose plugin (script auto-installs on Ubuntu if missing) +- Hugging Face token with access to Gemma 3 model (set as `HF_TOKEN`) + +## Quickstart +1. Clone from your Gitea server: + ```bash + git clone ssh://git@git.bhatfamily.in/rbhat/gemma3-vllm-stack.git + cd gemma3-vllm-stack + ``` + +2. Create configuration files: + ```bash + cp .env.example .env + cp backend/config/model.env.example backend/config/model.env + cp frontend/config/frontend.env.example frontend/config/frontend.env + ``` + +3. Edit `.env` and set at least: + - `HF_TOKEN` + - `VLLM_API_KEY` (recommended even on LAN) + +4. Install/start stack: + ```bash + ./scripts/install.sh + ``` + +5. Run smoke tests: + ```bash + ./scripts/test_api.sh + ./scripts/test_ui.sh + python3 scripts/test_python_client.py + ``` + +6. Open browser: + - `http://localhost:3000` + - Reverse proxy externally to `https://chat.bhatfamily.in` + +## Operations +- Restart stack: + ```bash + ./scripts/restart.sh + ``` + +- View logs: + ```bash + docker compose logs --tail=200 gemma3-vllm chat-ui + ``` + +- Stop and remove stack resources: + ```bash + ./scripts/uninstall.sh + ``` + +- Stop/remove stack and purge local cache/model/UI data: + ```bash + ./scripts/uninstall.sh --purge + ``` + +## Upgrade workflow +```bash +git pull +docker compose pull +./scripts/restart.sh +``` +More details: `docs/UPGRADE_NOTES.md`. + +## Default endpoints +- API base URL: `http://localhost:8000/v1` +- UI URL: `http://localhost:3000` + +Adjust using `.env`: +- `BACKEND_PORT` +- `FRONTEND_PORT` +- `GEMMA_MODEL_ID` + +## Notes for `chat.bhatfamily.in` +This repository intentionally does not terminate TLS. Bindings are plain HTTP on host ports and are designed for external reverse proxy + TLS handling (nginx/Caddy/Cloudflare Tunnel). 
diff --git a/backend/Dockerfile b/backend/Dockerfile new file mode 100644 index 0000000..e2784db --- /dev/null +++ b/backend/Dockerfile @@ -0,0 +1,4 @@ +# Optional backend Dockerfile. +# This stack uses the official vLLM ROCm image directly from docker-compose.yml. +# Keep this file for future customizations. +FROM vllm/vllm-openai-rocm:latest diff --git a/backend/config/model.env.example b/backend/config/model.env.example new file mode 100644 index 0000000..e5d1da6 --- /dev/null +++ b/backend/config/model.env.example @@ -0,0 +1,7 @@ +HF_TOKEN=YOUR_HF_TOKEN_HERE +VLLM_API_KEY=YOUR_LOCAL_API_KEY_HERE +GEMMA_MODEL_ID=google/gemma-3-1b-it +BACKEND_PORT=8000 +HUGGINGFACE_CACHE_DIR=/home/${USER}/.cache/huggingface +VLLM_MAX_MODEL_LEN=4096 +VLLM_GPU_MEMORY_UTILIZATION=0.88 diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..66be44e --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,69 @@ +services: + gemma3-vllm: + image: vllm/vllm-openai-rocm:latest + container_name: gemma3-vllm + restart: unless-stopped + env_file: + - ./backend/config/model.env + environment: + HUGGING_FACE_HUB_TOKEN: ${HF_TOKEN} + HF_TOKEN: ${HF_TOKEN} + PYTORCH_ROCM_ARCH: gfx1103 + command: + - --model + - ${GEMMA_MODEL_ID:-google/gemma-3-1b-it} + - --host + - 0.0.0.0 + - --port + - "8000" + - --dtype + - float16 + - --max-model-len + - ${VLLM_MAX_MODEL_LEN:-4096} + - --gpu-memory-utilization + - ${VLLM_GPU_MEMORY_UTILIZATION:-0.88} + - --api-key + - ${VLLM_API_KEY:-local-dev-key} + devices: + - /dev/kfd + - /dev/dri + group_add: + - video + cap_add: + - SYS_PTRACE + security_opt: + - seccomp=unconfined + ports: + - "${BACKEND_PORT:-8000}:8000" + volumes: + - ${HUGGINGFACE_CACHE_DIR:-/home/${USER}/.cache/huggingface}:/root/.cache/huggingface + - ./models:/models + healthcheck: + test: ["CMD-SHELL", "curl -sf http://localhost:8000/health >/dev/null || exit 1"] + interval: 30s + timeout: 10s + retries: 10 + start_period: 120s + + chat-ui: + image: 
ghcr.io/open-webui/open-webui:main + container_name: gemma3-chat-ui + restart: unless-stopped + depends_on: + gemma3-vllm: + condition: service_started + env_file: + - ./frontend/config/frontend.env + environment: + WEBUI_AUTH: "False" + OPENAI_API_BASE_URL: ${OPENAI_API_BASE_URL:-http://gemma3-vllm:8000/v1} + OPENAI_API_KEY: ${VLLM_API_KEY:-local-dev-key} + ENABLE_OPENAI_API: "True" + ENABLE_OLLAMA_API: "False" + DEFAULT_MODELS: ${GEMMA_MODEL_ID:-google/gemma-3-1b-it} + GLOBAL_LOG_LEVEL: INFO + WEBUI_NAME: Gemma 3 via vLLM + ports: + - "${FRONTEND_PORT:-3000}:8080" + volumes: + - ${OPEN_WEBUI_DATA_DIR:-./frontend/data/open-webui}:/app/backend/data diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..eac2be0 --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,72 @@ +# Architecture +## Component flow +```text +[Browser @ chat.bhatfamily.in] + | + | HTTPS (terminated externally) + v +[Host reverse proxy (external to this repo)] + | + | HTTP -> localhost:3000 + v +[chat-ui container: Open WebUI] + | + | HTTP (docker internal network) + v +[gemma3-vllm container: vLLM OpenAI API @ :8000/v1] + | + | reads model weights/cache + v +[Hugging Face cache + local models dir] + | + | ROCm runtime + v +[AMD Radeon 780M (RDNA3 iGPU) via /dev/kfd + /dev/dri] +``` + +## Services +### `gemma3-vllm` +- Image: `vllm/vllm-openai-rocm:latest` +- Purpose: Run Gemma 3 instruction model through OpenAI-compatible API. +- Host port mapping: `${BACKEND_PORT}:8000` (default `8000:8000`) +- Device passthrough: + - `/dev/kfd` + - `/dev/dri` +- Security/capabilities for ROCm debugging compatibility: + - `cap_add: SYS_PTRACE` + - `security_opt: seccomp=unconfined` + - `group_add: video` + +### `chat-ui` +- Image: `ghcr.io/open-webui/open-webui:main` +- Purpose: Browser chat experience with local persistence in mounted data directory. 
+- Host port mapping: `${FRONTEND_PORT}:8080` (default `3000:8080`) +- Upstream model endpoint on docker network: + - `OPENAI_API_BASE_URL=http://gemma3-vllm:8000/v1` + +## Networking +- Docker Compose default bridge network is used. +- `chat-ui` resolves `gemma3-vllm` by service name. +- External access is via host ports: + - API: `localhost:8000` + - UI: `localhost:3000` + +## Storage +- Hugging Face cache bind mount: + - Host: `${HUGGINGFACE_CACHE_DIR}` + - Container: `/root/.cache/huggingface` +- Optional local models directory: + - Host: `./models` + - Container: `/models` +- Open WebUI data: + - Host: `${OPEN_WEBUI_DATA_DIR}` + - Container: `/app/backend/data` + +## Scaling notes +This repository is designed for **single-node deployment** on one AMD APU/GPU host. + +For larger deployments later: +- Move to dedicated GPUs with larger VRAM. +- Use pinned vLLM image tags and explicit engine tuning. +- Consider externalized model storage and distributed orchestration (Kubernetes/Swarm/Nomad). +- Add request routing, autoscaling, and centralized observability. diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..db802e2 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,14 @@ +# Documentation Index +This folder contains operational and lifecycle documentation for the `gemma3-vllm-stack` repository. + +## Files +- `ARCHITECTURE.md`: Component topology, networking, runtime dependencies, and scaling notes. +- `TROUBLESHOOTING.md`: Common failures and copy-paste diagnostics/fixes for ROCm, Docker, vLLM, and UI issues. +- `UPGRADE_NOTES.md`: Safe upgrade, rollback, and backup guidance. + +## Recommended reading order +1. `ARCHITECTURE.md` +2. `TROUBLESHOOTING.md` +3. `UPGRADE_NOTES.md` + +For quick start and day-1 usage, use the repository root `README.md`. 
diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md new file mode 100644 index 0000000..3f2183b --- /dev/null +++ b/docs/TROUBLESHOOTING.md @@ -0,0 +1,172 @@ +# Troubleshooting +## ROCm devices not visible in host +Symptoms: +- `/dev/kfd` missing +- `/dev/dri` missing +- vLLM fails to start with ROCm device errors + +Checks: +```bash +ls -l /dev/kfd /dev/dri +id +getent group video +``` + +Expected: +- `/dev/kfd` exists +- `/dev/dri` directory exists +- user belongs to `video` group + +Fixes: +```bash +sudo usermod -aG video "$USER" +newgrp video +``` +Then verify ROCm tools: +```bash +rocminfo | sed -n '1,120p' +``` +If ROCm is not healthy, fix host ROCm installation first. + +--- + +## Docker and Compose not available +Symptoms: +- `docker: command not found` +- `docker compose version` fails + +Checks: +```bash +docker --version +docker compose version +``` + +Fix using install script (Ubuntu): +```bash +./scripts/install.sh +``` +Manual fallback: +```bash +sudo apt-get update +sudo apt-get install -y ca-certificates curl gnupg +sudo install -m 0755 -d /etc/apt/keyrings +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg +sudo chmod a+r /etc/apt/keyrings/docker.gpg +echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu jammy stable" | sudo tee /etc/apt/sources.list.d/docker.list >/dev/null +sudo apt-get update +sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +sudo usermod -aG docker "$USER" +``` +Log out/in after group change. + +--- + +## vLLM container exits or fails healthchecks +Symptoms: +- `gemma3-vllm` restarting +- API endpoint unavailable + +Checks: +```bash +docker compose ps +docker compose logs --tail=200 gemma3-vllm +``` + +Common causes and fixes: +1. 
Missing/invalid Hugging Face token: +```bash +grep -E '^(HF_TOKEN|GEMMA_MODEL_ID)=' .env +``` +Ensure `HF_TOKEN` is set to a valid token with access to Gemma 3. + +2. Model ID typo: +```bash +grep '^GEMMA_MODEL_ID=' .env +``` +Use a valid model, e.g. `google/gemma-3-1b-it`. + +3. ROCm runtime/device issues: +```bash +docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video ubuntu:22.04 bash -lc 'ls -l /dev/kfd /dev/dri' +``` + +4. API key mismatch between backend and UI/tests: +```bash +grep -E '^(VLLM_API_KEY|OPENAI_API_BASE_URL)=' .env frontend/config/frontend.env 2>/dev/null || true +``` +Keep keys consistent. + +--- + +## Out-of-memory (OOM) or low VRAM errors +Symptoms: +- startup failure referencing memory allocation +- runtime generation failures + +Checks: +```bash +docker compose logs --tail=300 gemma3-vllm | grep -Ei 'out of memory|oom|memory|cuda|hip|rocm' +``` + +Mitigations: +1. Reduce context length in `.env`: +```bash +VLLM_MAX_MODEL_LEN=2048 +``` +2. Lower GPU memory utilization target: +```bash +VLLM_GPU_MEMORY_UTILIZATION=0.75 +``` +3. Use a smaller Gemma 3 variant in `.env`. +4. Restart stack: +```bash +./scripts/restart.sh +``` + +--- + +## UI loads but cannot reach vLLM backend +Symptoms: +- Browser opens UI but chat requests fail. 
+ +Checks: +```bash +docker compose ps +docker compose logs --tail=200 chat-ui +docker compose logs --tail=200 gemma3-vllm +``` + +Verify frontend backend URL: +```bash +grep -E '^OPENAI_API_BASE_URL=' frontend/config/frontend.env +``` +Expected value: +```text +OPENAI_API_BASE_URL=http://gemma3-vllm:8000/v1 +``` + +Verify API directly from host: +```bash +./scripts/test_api.sh +``` + +If API works from host but not UI, recreate frontend: +```bash +docker compose up -d --force-recreate chat-ui +``` + +--- + +## Health checks and endpoint validation +Run all smoke tests: +```bash +./scripts/test_api.sh +./scripts/test_ui.sh +python3 scripts/test_python_client.py +``` + +If one fails, inspect corresponding service logs and then restart: +```bash +docker compose logs --tail=200 gemma3-vllm chat-ui +./scripts/restart.sh +``` diff --git a/docs/UPGRADE_NOTES.md b/docs/UPGRADE_NOTES.md new file mode 100644 index 0000000..0b34cb3 --- /dev/null +++ b/docs/UPGRADE_NOTES.md @@ -0,0 +1,50 @@ +# Upgrade Notes +## Standard safe upgrade path +From repository root: +```bash +git pull +docker compose pull +./scripts/restart.sh +``` +Then run smoke tests: +```bash +./scripts/test_api.sh +./scripts/test_ui.sh +python3 scripts/test_python_client.py +``` + +## Versioning guidance +- Prefer pinning image tags in `docker-compose.yml` once your deployment is stable. +- Upgrading vLLM may change runtime defaults or engine behavior; check vLLM release notes before major version jumps. +- Keep `GEMMA_MODEL_ID` explicit in `.env` to avoid unintentional model drift. + +## Model upgrade considerations +When changing Gemma 3 variants (for example, from 1B to larger sizes): +- Verify host RAM and GPU memory capacity. +- Expect re-download of model weights and larger disk usage. +- Re-tune: + - `VLLM_MAX_MODEL_LEN` + - `VLLM_GPU_MEMORY_UTILIZATION` +- Re-run validation scripts after restart. 
+ +## Backup recommendations +Before major upgrades, back up local persistent data: +```bash +mkdir -p backups +tar -czf backups/hf-cache-$(date +%Y%m%d-%H%M%S).tar.gz "${HOME}/.cache/huggingface" +tar -czf backups/open-webui-data-$(date +%Y%m%d-%H%M%S).tar.gz frontend/data/open-webui +``` +If you use local predownloaded models: +```bash +tar -czf backups/models-$(date +%Y%m%d-%H%M%S).tar.gz models +``` + +## Rollback approach +If a new image/model combination fails: +1. Revert `docker-compose.yml` and `.env` to previous known-good values. +2. Pull previous pinned images (if pinned by tag/digest). +3. Restart: +```bash +./scripts/restart.sh +``` +4. Re-run smoke tests. diff --git a/frontend/Dockerfile b/frontend/Dockerfile new file mode 100644 index 0000000..6eac168 --- /dev/null +++ b/frontend/Dockerfile @@ -0,0 +1,3 @@ +# Optional frontend Dockerfile. +# This stack uses the official Open WebUI image directly from docker-compose.yml. +FROM ghcr.io/open-webui/open-webui:main diff --git a/frontend/config/frontend.env.example b/frontend/config/frontend.env.example new file mode 100644 index 0000000..80e0dc3 --- /dev/null +++ b/frontend/config/frontend.env.example @@ -0,0 +1,5 @@ +FRONTEND_PORT=3000 +OPENAI_API_BASE_URL=http://gemma3-vllm:8000/v1 +VLLM_API_KEY=YOUR_LOCAL_API_KEY_HERE +GEMMA_MODEL_ID=google/gemma-3-1b-it +OPEN_WEBUI_DATA_DIR=./frontend/data/open-webui diff --git a/scripts/install.sh b/scripts/install.sh new file mode 100755 index 0000000..ce35fa0 --- /dev/null +++ b/scripts/install.sh @@ -0,0 +1,155 @@ +#!/usr/bin/env bash +# Installs prerequisites (if needed), prepares config files, and starts Gemma 3 + vLLM + chat UI stack. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +log() { + printf '[install] %s +' "$*" +} + +err() { + printf '[install][error] %s +' "$*" >&2 +} + +require_linux() { + if [[ "$(uname -s)" != "Linux" ]]; then + err "This script supports Linux only." 
+ exit 1 + fi +} + +install_docker_ubuntu() { + log "Installing Docker Engine and Compose plugin using official Docker apt repository." + sudo apt-get update + sudo apt-get install -y ca-certificates curl gnupg + sudo install -m 0755 -d /etc/apt/keyrings + + if [[ ! -f /etc/apt/keyrings/docker.gpg ]]; then + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg + sudo chmod a+r /etc/apt/keyrings/docker.gpg + fi + + source /etc/os-release + local arch + arch="$(dpkg --print-architecture)" + local codename + codename="${VERSION_CODENAME:-jammy}" + + echo "deb [arch=${arch} signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu ${codename} stable" | sudo tee /etc/apt/sources.list.d/docker.list >/dev/null + + sudo apt-get update + sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin + + if ! sudo systemctl is-active --quiet docker; then + sudo systemctl enable --now docker + fi + + if ! getent group docker >/dev/null; then + sudo groupadd docker + fi + + if ! id -nG "${USER}" | grep -qw docker; then + sudo usermod -aG docker "${USER}" + log "Added ${USER} to docker group. You may need to log out and back in for group membership to apply." + fi +} + +check_or_install_docker() { + local have_docker=1 + local have_compose=1 + + if ! command -v docker >/dev/null 2>&1; then + have_docker=0 + fi + + if ! docker compose version >/dev/null 2>&1; then + have_compose=0 + fi + + if [[ ${have_docker} -eq 1 && ${have_compose} -eq 1 ]]; then + log "Docker and Compose plugin are already available." + return + fi + + if [[ -f /etc/os-release ]]; then + source /etc/os-release + if [[ "${ID:-}" == "ubuntu" ]]; then + install_docker_ubuntu + return + fi + fi + + err "Docker/Compose missing and automatic installation is implemented for Ubuntu only." 
+ err "See docs/TROUBLESHOOTING.md#docker-and-compose-not-available" + exit 1 +} + +prepare_env_files() { + if [[ ! -f "${REPO_ROOT}/.env" ]]; then + cp "${REPO_ROOT}/.env.example" "${REPO_ROOT}/.env" + log "Created .env from .env.example." + err "IMPORTANT: edit .env and set HF_TOKEN (and optionally VLLM_API_KEY) before production use." + fi + + if [[ ! -f "${REPO_ROOT}/backend/config/model.env" ]]; then + cp "${REPO_ROOT}/backend/config/model.env.example" "${REPO_ROOT}/backend/config/model.env" + log "Created backend/config/model.env from example." + fi + + if [[ ! -f "${REPO_ROOT}/frontend/config/frontend.env" ]]; then + cp "${REPO_ROOT}/frontend/config/frontend.env.example" "${REPO_ROOT}/frontend/config/frontend.env" + log "Created frontend/config/frontend.env from example." + fi + + mkdir -p "${REPO_ROOT}/models" "${REPO_ROOT}/frontend/data/open-webui" +} + +warn_if_rocm_devices_missing() { + if [[ ! -e /dev/kfd || ! -d /dev/dri ]]; then + err "ROCm device files /dev/kfd or /dev/dri are not available." + err "See docs/TROUBLESHOOTING.md#rocm-devices-not-visible-in-host" + fi +} + +start_stack() { + log "Pulling container images." + docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${REPO_ROOT}/.env" pull + + log "Starting containers in detached mode." 
+ docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${REPO_ROOT}/.env" up -d +} + +show_status_and_urls() { + local backend_port frontend_port + backend_port="$(grep -E '^BACKEND_PORT=' "${REPO_ROOT}/.env" | tail -n1 | cut -d'=' -f2 || true)" + frontend_port="$(grep -E '^FRONTEND_PORT=' "${REPO_ROOT}/.env" | tail -n1 | cut -d'=' -f2 || true)" + backend_port="${backend_port:-8000}" + frontend_port="${frontend_port:-3000}" + + log "Backend status:" + docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${REPO_ROOT}/.env" ps gemma3-vllm || true + + log "Frontend status:" + docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${REPO_ROOT}/.env" ps chat-ui || true + + printf ' +' + log "API endpoint: http://localhost:${backend_port}/v1" + log "Chat UI endpoint: http://localhost:${frontend_port}" + log "If startup fails, inspect logs with: docker compose logs --tail=200 gemma3-vllm chat-ui" +} + +main() { + require_linux + check_or_install_docker + prepare_env_files + warn_if_rocm_devices_missing + start_stack + show_status_and_urls +} + +main "$@" diff --git a/scripts/restart.sh b/scripts/restart.sh new file mode 100755 index 0000000..4a98b48 --- /dev/null +++ b/scripts/restart.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# Restarts the Gemma 3 vLLM stack and shows service status. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" +ENV_FILE="${REPO_ROOT}/.env" + +log() { + printf '[restart] %s +' "$*" +} + +if [[ ! -f "${ENV_FILE}" ]]; then + ENV_FILE="${REPO_ROOT}/.env.example" +fi + +log "Stopping stack." +docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${ENV_FILE}" down + +log "Starting stack." 
+docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${ENV_FILE}" up -d
+
+log "Current status:"
+docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${ENV_FILE}" ps
diff --git a/scripts/test_api.sh b/scripts/test_api.sh
new file mode 100755
index 0000000..196d19d
--- /dev/null
+++ b/scripts/test_api.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+# Tests local vLLM OpenAI-compatible API using curl and validates response shape.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+ENV_FILE="${REPO_ROOT}/.env"
+
+if [[ ! -f "${ENV_FILE}" ]]; then
+  echo "[test_api][error] .env file not found. Copy .env.example to .env first." >&2
+  exit 1
+fi
+
+# shellcheck disable=SC1090
+source "${ENV_FILE}"
+
+BACKEND_PORT="${BACKEND_PORT:-8000}"
+GEMMA_MODEL_ID="${GEMMA_MODEL_ID:-google/gemma-3-1b-it}"
+VLLM_API_KEY="${VLLM_API_KEY:-EMPTY}"
+API_URL="http://localhost:${BACKEND_PORT}/v1/chat/completions"
+
+payload_file="$(mktemp)"
+response_file="$(mktemp)"
+trap 'rm -f "${payload_file}" "${response_file}"' EXIT
+
+cat > "${payload_file}" <<JSON
+{
+  "model": "${GEMMA_MODEL_ID}",
+  "messages": [
+    {"role": "user", "content": "Say hello from Gemma 3 on vLLM in one sentence."}
+  ],
+  "max_tokens": 64
+}
+JSON
+
+http_status="$(curl -sS -o "${response_file}" -w '%{http_code}' \
+  -H 'Content-Type: application/json' \
+  -H "Authorization: Bearer ${VLLM_API_KEY}" \
+  --data @"${payload_file}" "${API_URL}")"
+if [[ "${http_status}" != "200" ]]; then
+  echo "[test_api][error] API call failed with HTTP status ${http_status}." >&2
+  cat "${response_file}" >&2
+  echo "[test_api][hint] See docs/TROUBLESHOOTING.md#vllm-container-exits-or-fails-healthchecks" >&2
+  exit 1
+fi
+
+if ! grep -q '"choices"' "${response_file}"; then
+  echo "[test_api][error] API response did not include expected 'choices' field." >&2
+  cat "${response_file}" >&2
+  exit 1
+fi
+
+echo "[test_api] Success. API responded with expected structure."
+cat "${response_file}" diff --git a/scripts/test_python_client.py b/scripts/test_python_client.py new file mode 100755 index 0000000..66cfe2f --- /dev/null +++ b/scripts/test_python_client.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +"""Tests local vLLM OpenAI-compatible API using openai>=1.x Python client.""" + +from __future__ import annotations + +import os +import sys +from pathlib import Path + + +def load_dotenv(dotenv_path: Path) -> None: + if not dotenv_path.exists(): + return + + for raw_line in dotenv_path.read_text(encoding="utf-8").splitlines(): + line = raw_line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + key, value = line.split("=", 1) + key = key.strip() + value = value.strip().strip('"').strip("'") + os.environ.setdefault(key, value) + + +def main() -> int: + repo_root = Path(__file__).resolve().parent.parent + load_dotenv(repo_root / ".env") + + backend_port = os.getenv("BACKEND_PORT", "8000") + model_id = os.getenv("GEMMA_MODEL_ID", "google/gemma-3-1b-it") + api_key = os.getenv("VLLM_API_KEY", "EMPTY") + base_url = f"http://localhost:{backend_port}/v1" + + try: + from openai import OpenAI + except ImportError: + print("[test_python_client][error] openai package is not installed.", file=sys.stderr) + print("Install it with: python3 -m pip install openai", file=sys.stderr) + return 1 + + client = OpenAI(api_key=api_key, base_url=base_url) + + try: + response = client.chat.completions.create( + model=model_id, + messages=[ + {"role": "system", "content": "You are a concise assistant."}, + { + "role": "user", + "content": "Say hello from Gemma 3 running on vLLM in one sentence.", + }, + ], + temperature=0.2, + max_tokens=64, + ) + except Exception as exc: + print(f"[test_python_client][error] Request failed: {exc}", file=sys.stderr) + print( + "[test_python_client][hint] See docs/TROUBLESHOOTING.md#vllm-container-exits-or-fails-healthchecks", + file=sys.stderr, + ) + return 1 + + if not response.choices or not 
response.choices[0].message: + print("[test_python_client][error] No completion choices returned.", file=sys.stderr) + return 1 + + content = response.choices[0].message.content or "" + print("[test_python_client] Success. Assistant response:") + print(content.strip()) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/test_ui.sh b/scripts/test_ui.sh new file mode 100755 index 0000000..efb035e --- /dev/null +++ b/scripts/test_ui.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# Tests whether the chat UI is reachable on localhost frontend port. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" +ENV_FILE="${REPO_ROOT}/.env" + +if [[ -f "${ENV_FILE}" ]]; then + # shellcheck disable=SC1090 + source "${ENV_FILE}" +fi + +FRONTEND_PORT="${FRONTEND_PORT:-3000}" +UI_URL="http://localhost:${FRONTEND_PORT}" + +http_status="$(curl -sS -o /dev/null -w '%{http_code}' "${UI_URL}")" + +if [[ "${http_status}" != "200" && "${http_status}" != "301" && "${http_status}" != "302" ]]; then + echo "[test_ui][error] UI check failed with HTTP status ${http_status} at ${UI_URL}" >&2 + echo "[test_ui][hint] See docs/TROUBLESHOOTING.md#ui-loads-but-cannot-reach-vllm-backend" >&2 + exit 1 +fi + +echo "[test_ui] Chat UI is reachable at ${UI_URL} (HTTP ${http_status})." diff --git a/scripts/uninstall.sh b/scripts/uninstall.sh new file mode 100755 index 0000000..736fa8e --- /dev/null +++ b/scripts/uninstall.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash +# Stops and removes the Gemma 3 vLLM stack. Optional --purge removes local model/cache data. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" +PURGE=0 + +log() { + printf '[uninstall] %s +' "$*" +} + +err() { + printf '[uninstall][error] %s +' "$*" >&2 +} + +usage() { + cat <<'EOF' +Usage: scripts/uninstall.sh [--purge] + +Options: + --purge Remove local Hugging Face cache directory and ./models data in addition to containers/volumes. + -h, --help Show this help message. +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --purge) + PURGE=1 + ;; + -h|--help) + usage + exit 0 + ;; + *) + err "Unknown argument: $1" + usage + exit 1 + ;; + esac + shift +done + +if [[ ! -f "${REPO_ROOT}/docker-compose.yml" ]]; then + err "docker-compose.yml not found at ${REPO_ROOT}." + exit 1 +fi + +ENV_FILE="${REPO_ROOT}/.env" +if [[ ! -f "${ENV_FILE}" ]]; then + ENV_FILE="${REPO_ROOT}/.env.example" +fi + +log "Stopping stack and removing containers, networks, and named/anonymous volumes." +docker compose -f "${REPO_ROOT}/docker-compose.yml" --env-file "${ENV_FILE}" down -v || true + +if [[ ${PURGE} -eq 1 ]]; then + log "Purge requested. Removing local data directories used by this stack." 
+
+  huggingface_cache_dir="$(grep -E '^HUGGINGFACE_CACHE_DIR=' "${ENV_FILE}" | tail -n1 | cut -d'=' -f2- || true)"
+  open_webui_data_dir="$(grep -E '^OPEN_WEBUI_DATA_DIR=' "${ENV_FILE}" | tail -n1 | cut -d'=' -f2- || true)"
+
+  if [[ -n "${huggingface_cache_dir}" ]]; then
+    # Expand a potential ${USER} reference safely (no eval of .env content)
+    evaluated_hf_dir="${huggingface_cache_dir//\$\{USER\}/${USER}}"
+    if [[ -d "${evaluated_hf_dir}" ]]; then
+      log "Removing Hugging Face cache directory: ${evaluated_hf_dir}"
+      rm -rf -- "${evaluated_hf_dir:?}"
+    else
+      log "Hugging Face cache directory not found: ${evaluated_hf_dir}"
+    fi
+  fi
+
+  if [[ -z "${open_webui_data_dir}" ]]; then
+    open_webui_data_dir="./frontend/data/open-webui"
+  fi
+
+  if [[ "${open_webui_data_dir}" == ./* ]]; then
+    open_webui_data_dir="${REPO_ROOT}/${open_webui_data_dir#./}"
+  fi
+
+  if [[ -d "${open_webui_data_dir}" ]]; then
+    log "Removing Open WebUI data directory: ${open_webui_data_dir}"
+    rm -rf -- "${open_webui_data_dir:?}"
+  fi
+
+  if [[ -d "${REPO_ROOT}/models" ]]; then
+    log "Removing local models directory: ${REPO_ROOT}/models"
+    rm -rf -- "${REPO_ROOT}/models"
+  fi
+else
+  log "Safe mode enabled (default). Local model/cache data was preserved."
+fi
+
+log "Uninstall complete."