diff --git a/.forgejo/workflows/build-iso.yml b/.forgejo/workflows/build-iso.yml index 30b3045..46b0704 100644 --- a/.forgejo/workflows/build-iso.yml +++ b/.forgejo/workflows/build-iso.yml @@ -50,3 +50,34 @@ jobs: path: iso/out/*.iso retention-days: 14 if-no-files-found: error + + smoke-vm: + # Boot the freshly built ISO in a VM on the .165 Proxmox test host and + # check the webinstaller responds on :5000. `needs:` only orders this job + # after build-iso; the ISO is re-fetched from the uploaded artifact below. + # `continue-on-error: true` so a VM-side flake doesn't mark the ISO + # build red — the ISO itself is still valid and uploaded. + needs: build-iso + runs-on: self-hosted + continue-on-error: true + timeout-minutes: 10 + steps: + - uses: actions/checkout@v4 + + - name: Re-download ISO into iso/out + # `needs:` doesn't preserve the workspace across jobs on Forgejo + # host-mode runners, so pull the artifact we just uploaded. + uses: actions/download-artifact@v3 + with: + name: furtka-iso + path: iso/out + + - name: Smoke-test ISO on Proxmox test host + env: + PVE_TEST_HOST: ${{ secrets.PVE_TEST_HOST }} + PVE_TEST_TOKEN: ${{ secrets.PVE_TEST_TOKEN }} + SMOKE_SHA: ${{ github.sha }} + run: | + iso=$(ls iso/out/*.iso | head -1) + echo "Smoking $iso" + ./scripts/smoke-vm.sh "$iso" diff --git a/docs/smoke-vm.md b/docs/smoke-vm.md new file mode 100644 index 0000000..74fbe82 --- /dev/null +++ b/docs/smoke-vm.md @@ -0,0 +1,106 @@ +# Smoke VM on Proxmox Test Host + +Every push to `main` builds a fresh ISO (`build-iso.yml`) and then boots +it in a throwaway VM on the Proxmox test host — currently +`192.168.178.165` — to confirm the live ISO boots and the webinstaller +responds on `:5000`. If the smoke step fails, the ISO artifact is still +uploaded and the VM is left running for post-mortem. + +The heavy lifting lives in [`scripts/smoke-vm.sh`](../scripts/smoke-vm.sh); +the workflow just downloads the artifact and shells out. 
+ +## Where smoke VMs live + +- Node: whatever the test host reports as its node name (auto-detected) +- VMID range: `9000–9099` (`PVE_TEST_VMID_MIN` / `PVE_TEST_VMID_MAX`) +- Name: `furtka-smoke-<12-char-sha>` +- Tags: `furtka`, `smoke`, `sha-<12-char-sha>` +- MAC: `BC:24:11:<first-6-hex-of-sha>` (Proxmox's OUI plus SHA bytes; lets the runner + find the VM by scanning the LAN — the live ISO has no guest agent) +- ISO on test host: `local:iso/furtka-<12-char-sha>.iso` + +Five most recent VMs (and their ISOs) are kept; anything older is stopped +and purged (`destroy-unreferenced-disks=1`) on the next run. Tune via +`PVE_TEST_KEEP`. + +## Poking a failed smoke VM + +1. Find it in the Proxmox WebUI — look for `furtka-smoke-<12-char-sha>` in the + 9000-range. The VM is still running. +2. Console: **Console** tab in the WebUI (SPICE or noVNC). The webinstaller + logs to `journalctl -u furtka-webinstaller.service` on the live ISO. +3. SSH: the live Arch ISO ships `sshd` enabled with no root password. + Without credentials, logging in over SSH from the LAN is normally not possible — + use the WebUI console instead. (The **installed** system, post-wizard, + has the `server` user with the password the wizard set.) +4. Fetch the short-sha from the VM name → cross-reference against + `git log` to see exactly which commit built the failing ISO. + +## Running a smoke test locally + +Needs LAN access to the test Proxmox and an API token with VM perms. + +```bash +PVE_TEST_HOST=192.168.178.165 \ +PVE_TEST_TOKEN='user@pve!smoke=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx' \ +./scripts/smoke-vm.sh iso/out/furtka-*.iso +``` + +The script exits 0 on success, non-zero if the VM never served +`http://<vm-ip>:5000`. Pruning runs either way. 
+ +## Clearing the 9000-range by hand + +If smoke tests wedge or you want a clean slate: + +```bash +# List smoke VMs +curl -sSk -H "Authorization: PVEAPIToken=${PVE_TEST_TOKEN}" \ + https://192.168.178.165:8006/api2/json/nodes/<node>/qemu \ + | python3 -c 'import json,sys; [print(v["vmid"],v["name"]) for v in json.load(sys.stdin)["data"] if 9000<=int(v["vmid"])<=9099]' + +# Destroy one +curl -sSk -X POST -H "Authorization: PVEAPIToken=${PVE_TEST_TOKEN}" \ + https://192.168.178.165:8006/api2/json/nodes/<node>/qemu/<vmid>/status/stop +curl -sSk -X DELETE -H "Authorization: PVEAPIToken=${PVE_TEST_TOKEN}" \ + "https://192.168.178.165:8006/api2/json/nodes/<node>/qemu/<vmid>?purge=1&destroy-unreferenced-disks=1" +``` + +Or just run `scripts/smoke-vm.sh` with `PVE_TEST_KEEP=0` and any ISO — +the prune step will sweep everything in the range except the one it +just created. + +## Proxmox API token setup (one-time) + +1. WebUI → **Datacenter → Permissions → API Tokens → Add** +2. User: `root@pam` (or a dedicated `smoke@pve` user — see below) +3. Token ID: `smoke` +4. Uncheck **Privilege Separation** for the quick path, or keep it + separated and grant explicit perms below +5. Save the displayed secret once — it's shown only here + +Minimum perms on `/` (if privilege-separated): +`VM.Allocate`, `VM.Config.Disk`, `VM.Config.CPU`, `VM.Config.Memory`, +`VM.Config.Network`, `VM.Config.Options`, `VM.Config.HWType`, +`VM.Config.CDROM`, `VM.PowerMgmt`, `VM.Audit`, `Datastore.AllocateTemplate` +(for ISO upload/delete on the `local` content store). + +Set the result as Forgejo secret `PVE_TEST_TOKEN` in the format: + +``` +user@realm!tokenid=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx +``` + +…and `PVE_TEST_HOST` as `192.168.178.165`. That's all the workflow needs. + +## Assumptions + +- Runner has L2 reachability to `192.168.178.0/24` (MAC→IP discovery + uses `arp-scan` from the runner). +- Test host uses default storage names: `local` for ISOs, `local-lvm` for + disks. 
Override via `PVE_TEST_ISO_STORAGE` / `PVE_TEST_DISK_STORAGE`.
+- Bridge `vmbr0` carries LAN DHCP. Override via `PVE_TEST_BRIDGE`.
+
+If any of those don't match, set the corresponding env var in
+`build-iso.yml` (via `env:` on the smoke step) or override on the CLI
+when running locally.
diff --git a/ops/forgejo-runner/bootstrap.sh b/ops/forgejo-runner/bootstrap.sh
index 133719c..02baaef 100755
--- a/ops/forgejo-runner/bootstrap.sh
+++ b/ops/forgejo-runner/bootstrap.sh
@@ -12,7 +12,10 @@ fi
 
 echo "==> Updating apt and installing prerequisites"
 sudo apt-get update -y
-sudo apt-get install -y ca-certificates curl gnupg
+# arp-scan (primary) and nmap (fallback sweep): needed by scripts/smoke-vm.sh
+# for MAC→IP discovery of the test VM on the Proxmox test host (live ISO has
+# no guest agent, so we scan the LAN for the MAC we assigned at VM creation).
+sudo apt-get install -y ca-certificates curl gnupg arp-scan iputils-arping nmap
 
 echo "==> Adding Docker's official GPG key"
 sudo install -m 0755 -d /etc/apt/keyrings
diff --git a/scripts/smoke-vm.sh b/scripts/smoke-vm.sh
new file mode 100755
index 0000000..a05527f
--- /dev/null
+++ b/scripts/smoke-vm.sh
@@ -0,0 +1,261 @@
+#!/usr/bin/env bash
+# Smoke-test a freshly built Furtka live ISO by booting it in a VM on the
+# Proxmox test host ($PVE_TEST_HOST) and checking that the webinstaller
+# answers HTTP 200 on :5000.
+#
+# Usage: ./scripts/smoke-vm.sh <path-to-iso>
+#
+# Required env:
+#   PVE_TEST_HOST          IP/hostname of the test node (e.g. 192.168.178.165)
+#   PVE_TEST_TOKEN         "user@realm!tokenid=secret" single string
+#
+# Optional env:
+#   PVE_TEST_NODE          PVE node name; auto-detected from /nodes if empty
+#   PVE_TEST_ISO_STORAGE   default "local"
+#   PVE_TEST_DISK_STORAGE  default "local-lvm"
+#   PVE_TEST_BRIDGE        default "vmbr0"
+#   PVE_TEST_VMID_MIN      default 9000
+#   PVE_TEST_VMID_MAX      default 9099
+#   PVE_TEST_KEEP          how many smoke VMs to retain, this run's included
+#                          (default 5)
+#   PVE_TEST_BOOT_TIMEOUT  seconds to wait for :5000 (default 180)
+#   PVE_TEST_IP_TIMEOUT    seconds to wait for the VM's MAC on the LAN
+#                          (default 150)
+#   PVE_TEST_SCAN_CIDR     subnet swept by the nmap fallback
+#                          (default 192.168.178.0/24)
+#   SMOKE_SHA              commit SHA used in name/tag/MAC (>= 12 hex chars);
+#                          defaults to git HEAD, else the ISO's sha1
+#
+# Exits 0 iff the ISO booted and :5000 answered. Old VMs + ISOs are pruned
+# after the test regardless of outcome; the VM created by this run is never
+# pruned, so a failed build's VM stays behind for post-mortem.
+set -euo pipefail
+
+ISO_PATH="${1:?usage: $0 <path-to-iso>}"
+[[ -f "$ISO_PATH" ]] || { echo "iso not found: $ISO_PATH" >&2; exit 1; }
+
+: "${PVE_TEST_HOST:?PVE_TEST_HOST must be set}"
+: "${PVE_TEST_TOKEN:?PVE_TEST_TOKEN must be set}"
+ISO_STORAGE="${PVE_TEST_ISO_STORAGE:-local}"
+DISK_STORAGE="${PVE_TEST_DISK_STORAGE:-local-lvm}"
+BRIDGE="${PVE_TEST_BRIDGE:-vmbr0}"
+VMID_MIN="${PVE_TEST_VMID_MIN:-9000}"
+VMID_MAX="${PVE_TEST_VMID_MAX:-9099}"
+KEEP="${PVE_TEST_KEEP:-5}"
+BOOT_TIMEOUT="${PVE_TEST_BOOT_TIMEOUT:-180}"
+IP_TIMEOUT="${PVE_TEST_IP_TIMEOUT:-150}"
+SCAN_CIDR="${PVE_TEST_SCAN_CIDR:-192.168.178.0/24}"
+
+# These feed shell arithmetic and the prune logic, so insist on plain integers.
+for num in "$VMID_MIN" "$VMID_MAX" "$KEEP" "$BOOT_TIMEOUT" "$IP_TIMEOUT"; do
+  [[ "$num" =~ ^[0-9]+$ ]] || { echo "expected a non-negative integer, got: $num" >&2; exit 1; }
+done
+
+# Name, tag and MAC are all cut from the SHA, so it must be real hex. Outside
+# a git checkout fall back to the ISO's own sha1 rather than a placeholder
+# that would yield an invalid MAC.
+SHA="${SMOKE_SHA:-$(git rev-parse --verify HEAD 2>/dev/null || sha1sum "$ISO_PATH" | cut -d' ' -f1)}"
+[[ "$SHA" =~ ^[0-9a-fA-F]{12,}$ ]] \
+  || { echo "SMOKE_SHA must be at least 12 hex chars, got: $SHA" >&2; exit 1; }
+SHORT_SHA="${SHA:0:12}"
+
+API="https://${PVE_TEST_HOST}:8006/api2/json"
+
+# Authenticated curl against the PVE API; extra arguments pass straight
+# through. -k skips TLS certificate verification.
+api() {
+  curl --silent --show-error --fail-with-body -k \
+    --header "Authorization: PVEAPIToken=${PVE_TEST_TOKEN}" \
+    "$@"
+}
+
+# Auto-detect node name if not given: first entry from /nodes.
+NODE="${PVE_TEST_NODE:-}"
+if [[ -z "$NODE" ]]; then
+  NODE="$(api "$API/nodes" | python3 -c '
+import json, sys
+nodes = json.load(sys.stdin)["data"]
+if not nodes:
+    sys.exit("no nodes returned from PVE")
+print(nodes[0]["node"])
+')"
+fi
+echo "==> node=$NODE sha=$SHORT_SHA iso=$(basename "$ISO_PATH")"
+
+ISO_NAME="furtka-${SHORT_SHA}.iso"
+VOLID="${ISO_STORAGE}:iso/${ISO_NAME}"
+
+# --- Step 1: upload ISO ----------------------------------------------------
+# PVE's upload endpoint errors if the file exists. Delete first so re-runs
+# on the same SHA (e.g. workflow re-dispatch) work.
+if api "$API/nodes/$NODE/storage/$ISO_STORAGE/content/$VOLID" \
+    --output /dev/null 2>/dev/null; then
+  echo "==> removing stale ISO $VOLID"
+  api --request DELETE "$API/nodes/$NODE/storage/$ISO_STORAGE/content/$VOLID" \
+    --output /dev/null
+fi
+
+echo "==> uploading ISO as $ISO_NAME"
+api --request POST "$API/nodes/$NODE/storage/$ISO_STORAGE/upload" \
+  --form "content=iso" \
+  --form "filename=@${ISO_PATH};filename=${ISO_NAME}" \
+  > /dev/null
+
+# --- Step 2: pick a free VMID in the reserved range ------------------------
+# List VMs on the node, filter by range, pick the lowest integer not in use.
+USED="$(api "$API/nodes/$NODE/qemu" | python3 -c '
+import json, sys
+data = json.load(sys.stdin)["data"]
+print(" ".join(str(v["vmid"]) for v in data))
+')"
+
+VMID=""
+for ((id = VMID_MIN; id <= VMID_MAX; id++)); do
+  if ! [[ " $USED " == *" $id "* ]]; then
+    VMID="$id"
+    break
+  fi
+done
+[[ -n "$VMID" ]] || { echo "no free VMID in ${VMID_MIN}-${VMID_MAX}" >&2; exit 1; }
+
+# Derive a stable MAC from the SHA. BC:24:11 is Proxmox's assigned OUI.
+MAC_TAIL="$(echo "${SHORT_SHA:0:6}" | tr 'a-z' 'A-Z')"
+MAC="BC:24:11:${MAC_TAIL:0:2}:${MAC_TAIL:2:2}:${MAC_TAIL:4:2}"
+
+echo "==> creating VM $VMID name=furtka-smoke-${SHORT_SHA} mac=$MAC"
+api --request POST "$API/nodes/$NODE/qemu" \
+  --data-urlencode "vmid=$VMID" \
+  --data-urlencode "name=furtka-smoke-${SHORT_SHA}" \
+  --data-urlencode "tags=furtka;smoke;sha-${SHORT_SHA}" \
+  --data-urlencode "cores=2" \
+  --data-urlencode "memory=4096" \
+  --data-urlencode "bios=ovmf" \
+  --data-urlencode "machine=q35" \
+  --data-urlencode "ostype=l26" \
+  --data-urlencode "scsihw=virtio-scsi-single" \
+  --data-urlencode "efidisk0=${DISK_STORAGE}:1,efitype=4m,pre-enrolled-keys=0" \
+  --data-urlencode "scsi0=${DISK_STORAGE}:20,discard=on,ssd=1" \
+  --data-urlencode "ide2=${VOLID},media=cdrom" \
+  --data-urlencode "boot=order=ide2;scsi0" \
+  --data-urlencode "net0=virtio=${MAC},bridge=${BRIDGE},firewall=0" \
+  > /dev/null
+
+echo "==> starting VM $VMID"
+api --request POST "$API/nodes/$NODE/qemu/$VMID/status/start" > /dev/null
+
+# --- Step 3: discover the VM's IP by MAC -----------------------------------
+# The live ISO has no qemu-guest-agent, so PVE can't tell us the IP.
+# We scan the LAN from the runner and match on our derived MAC.
+# The scans are `|| true`-guarded: with errexit + pipefail a failed sudo, or
+# arp-scan dying on a closed pipe, would otherwise abort before the prune.
+MAC_LOWER="$(echo "$MAC" | tr 'A-Z' 'a-z')"
+IP=""
+deadline=$((SECONDS + IP_TIMEOUT))
+while (( SECONDS < deadline )); do
+  if command -v arp-scan >/dev/null 2>&1; then
+    IP="$(sudo arp-scan --localnet --quiet --ignoredups 2>/dev/null \
+      | awk -v m="$MAC_LOWER" 'tolower($2) == m && !hit { print $1; hit = 1 }')" || true
+  fi
+  if [[ -z "$IP" ]] && command -v nmap >/dev/null 2>&1; then
+    sudo nmap -sn -T4 "$SCAN_CIDR" >/dev/null 2>&1 || true
+    IP="$(ip neigh show \
+      | awk -v m="$MAC_LOWER" 'tolower($5) == m && $1 ~ /^[0-9]/ && !hit { print $1; hit = 1 }')" || true
+  fi
+  [[ -n "$IP" ]] && break
+  sleep 5
+done
+
+# --- Step 4: smoke the webinstaller ----------------------------------------
+# Start from "failed" and flip to 0 only on a real answer, so a zero or
+# exhausted timeout can never be reported as a pass.
+SMOKE_RC=1
+if [[ -z "$IP" ]]; then
+  echo "!! never saw $MAC on the LAN within ${IP_TIMEOUT}s" >&2
+else
+  echo "==> VM $VMID is at $IP (mac $MAC)"
+  echo "==> polling http://${IP}:5000 (timeout ${BOOT_TIMEOUT}s)"
+  end=$((SECONDS + BOOT_TIMEOUT))
+  while (( SECONDS < end )); do
+    if curl --silent --fail --max-time 5 --output /dev/null "http://${IP}:5000/"; then
+      echo "==> :5000 answered 200 — smoke passed"
+      SMOKE_RC=0
+      break
+    fi
+    sleep 5
+  done
+  if [[ "$SMOKE_RC" -ne 0 ]]; then
+    echo "!! :5000 never returned 200 on ${IP}" >&2
+  fi
+fi
+
+# --- Step 5: prune old smoke VMs + ISOs ------------------------------------
+# Best-effort from here on: a prune hiccup must not change the smoke verdict.
+echo "==> pruning smoke VMs, keeping last $KEEP"
+
+# Print the creation time (epoch seconds) PVE recorded in a VM's `meta`
+# config entry, or 0 when it is missing or the lookup fails.
+vm_ctime() {
+  api "$API/nodes/$NODE/qemu/$1/config" 2>/dev/null | python3 -c '
+import json, re, sys
+meta = str(json.load(sys.stdin)["data"].get("meta", ""))
+hit = re.search(r"ctime=(\d+)", meta)
+print(hit.group(1) if hit else 0)
+' 2>/dev/null || echo 0
+}
+
+# Smoke VMs in the reserved range, minus the one this run just created.
+OTHERS="$(api "$API/nodes/$NODE/qemu" | python3 -c '
+import json, sys
+lo, hi, cur = (int(a) for a in sys.argv[1:4])
+for v in json.load(sys.stdin)["data"]:
+    vmid = int(v["vmid"])
+    if lo <= vmid <= hi and vmid != cur:
+        print(vmid)
+' "$VMID_MIN" "$VMID_MAX" "$VMID" || true)"
+
+# VMIDs are handed out lowest-free-first, so they say nothing about age:
+# rank by creation time instead (newest first, VMID as tie-break). This
+# run's VM takes one of the KEEP slots.
+KEEP_OLD=$(( KEEP > 0 ? KEEP - 1 : 0 ))
+TO_DROP="$(
+  for id in $OTHERS; do
+    echo "$(vm_ctime "$id") $id"
+  done | sort -k1,1nr -k2,2nr | awk -v keep="$KEEP_OLD" 'NR > keep { print $2 }'
+)" || true
+
+for old in $TO_DROP; do
+  echo "    dropping VM $old"
+  # Find the ISO the VM was booted from so we can delete it after.
+  OLD_ISO="$(api "$API/nodes/$NODE/qemu/$old/config" | python3 -c '
+import json, sys, re
+cfg = json.load(sys.stdin)["data"]
+for k in ("ide0","ide1","ide2","ide3","sata0","sata1","sata2","sata3"):
+    v = cfg.get(k,"")
+    m = re.match(r"([^,]+),.*media=cdrom", v)
+    if m and m.group(1).endswith(".iso"):
+        print(m.group(1)); break
+' || true)"
+  # Stop (ignore errors if already stopped), then purge.
+  api --request POST "$API/nodes/$NODE/qemu/$old/status/stop" \
+    --output /dev/null 2>/dev/null || true
+  # Stop is asynchronous and PVE refuses to destroy a running VM, so give
+  # it up to ~30s to actually power off before asking for the delete.
+  for ((try = 0; try < 15; try++)); do
+    state="$(api "$API/nodes/$NODE/qemu/$old/status/current" 2>/dev/null \
+      | python3 -c 'import json, sys; print(json.load(sys.stdin)["data"]["status"])' \
+      2>/dev/null || true)"
+    [[ "$state" == "stopped" ]] && break
+    sleep 2
+  done
+  # /qemu/<vmid> DELETE is async; the call returns a UPID but for our purposes
+  # "fire and forget" is fine — next prune will retry if it didn't land.
+  api --request DELETE "$API/nodes/$NODE/qemu/$old?purge=1&destroy-unreferenced-disks=1" \
+    --output /dev/null || echo "    (delete of $old failed; skipping)"
+  if [[ -n "$OLD_ISO" && "$OLD_ISO" != "$VOLID" ]]; then
+    echo "    dropping ISO $OLD_ISO"
+    api --request DELETE "$API/nodes/$NODE/storage/$ISO_STORAGE/content/$OLD_ISO" \
+      --output /dev/null 2>/dev/null || true
+  fi
+done
+
+exit "$SMOKE_RC"