changeset 332:d4893670f888 default tip

WIP: use watchdog reboot timer on pi
author drewp@bigasterisk.com
date Thu, 27 Feb 2025 11:09:29 -0800
parents 50a8b6c39b38
children
files pi-watchdog/Dockerfile pi-watchdog/deploy.yaml pi-watchdog/readme pi-watchdog/rootfs-check.sh pi-watchdog/run-watchdog.sh pi-watchdog/tasks.py pi-watchdog/watchdog.conf
diffstat 7 files changed, 206 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pi-watchdog/Dockerfile	Thu Feb 27 11:09:29 2025 -0800
@@ -0,0 +1,20 @@
+FROM alpine:latest
+
+# Enable the edge/testing repository, upgrade, and install watchdog plus util-linux
+# (findmnt, used by rootfs-check.sh); --no-cache leaves no apk index in the layer.
+RUN echo "http://dl-cdn.alpinelinux.org/alpine/edge/testing" >> /etc/apk/repositories \
+    && apk upgrade --no-cache \
+    && apk add --no-cache \
+        util-linux \
+        watchdog
+
+# Daemon config, the test script it runs, and the entrypoint; --chmod sets the
+# executable bit at copy time instead of in a separate RUN chmod layer.
+COPY watchdog.conf /etc/watchdog.conf
+COPY --chmod=755 rootfs-check.sh /usr/local/bin/rootfs-check.sh
+COPY --chmod=755 run-watchdog.sh /run-watchdog.sh
+
+# Expose port for reboot signal (optional)
+EXPOSE 8080
+
+CMD ["/run-watchdog.sh"]
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pi-watchdog/deploy.yaml	Thu Feb 27 11:09:29 2025 -0800
@@ -0,0 +1,34 @@
+---
+# DaemonSet running the pi-watchdog image on nodes labelled kubernetes.io/arch=arm64.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: pi-watchdog
+  namespace: default
+  labels: {app: pi-watchdog}
+spec:
+  selector:
+    matchLabels: {app: pi-watchdog}
+  template:
+    metadata:
+      labels: {app: pi-watchdog}
+    spec:
+      nodeSelector:
+        # ought to be "only hosts that use netboot/iscsi"
+        kubernetes.io/arch: arm64
+      containers:
+        - name: watchdog
+          image: reg:5000/pi_watchdog
+          imagePullPolicy: Always
+          # Runs as root and privileged, with the host's /dev mounted at /dev.
+          securityContext:
+            privileged: true
+            runAsUser: 0
+          volumeMounts:
+            - mountPath: /dev
+              name: dev
+      volumes:
+        # hostPath volume exposing the node's /dev directory to the container.
+        - name: dev
+          hostPath:
+            path: /dev
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pi-watchdog/readme	Thu Feb 27 11:09:29 2025 -0800
@@ -0,0 +1,1 @@
+k8s daemonset for setting up pi watchdog reboot timers
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pi-watchdog/rootfs-check.sh	Thu Feb 27 11:09:29 2025 -0800
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+echo "Starting rootfs-check.sh script"
+
+# check rootfs share is mounted
+if ! findmnt /; then
+  echo "rootfs share is not mounted"
+  exit 1
+fi
+
+# check rootfs share is accessible
+if ! ls / > /dev/null 2>&1; then
+  echo "rootfs share is not accessible"
+  exit 1
+fi
+
+echo "v1 rootfs share is OK"
+exit 0
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pi-watchdog/run-watchdog.sh	Thu Feb 27 11:09:29 2025 -0800
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+# start watchdog service
+watchdog -vF
+
+# while true; do
+#   /usr/local/bin/rootfs-check.sh
+#   sleep 10
+# done
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pi-watchdog/tasks.py	Thu Feb 27 11:09:29 2025 -0800
@@ -0,0 +1,24 @@
+import datetime
+
+from invoke import task
+
+# todo- see https://github.com/GoogleContainerTools/skaffold/issues/9543 for one obstacle
+
+now = datetime.datetime.now().replace(microsecond=0)
+now = now.isoformat().replace(':', '_').replace('-', '_')
+base = 'pi_watchdog'
+localImage = f'{base}:{now}'
+remoteImage = f'reg:5000/{base}:latest'
+
+
+@task
+def build(ctx):  # workaround for skaffold build
+    ctx.run(f'podman manifest create {localImage}', echo=True)
+    ctx.run(f'podman build --platform linux/arm64 --manifest {localImage} -f Dockerfile .', echo=True)
+    ctx.run(f'podman manifest push {localImage} {remoteImage}', echo=True)
+    print(f'{remoteImage} ready for use')
+
+
+@task(pre=[build])
+def run(ctx):
+    ctx.run('kubectl apply -f deploy.yaml', echo=True)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pi-watchdog/watchdog.conf	Thu Feb 27 11:09:29 2025 -0800
@@ -0,0 +1,100 @@
+# ====================================================================
+# Configuration for the watchdog daemon. For more information on the
+# parameters in this file use the command 'man watchdog.conf'
+# ====================================================================
+
+# =================== The hardware timer settings ====================
+#
+# Device node of the hardware timer. Keep exactly one watchdog-device
+# line in this file so there is no doubt which node the daemon opens.
+# If the node is missing, work your way through the modules listed under:
+#
+# /lib/modules/`uname -r`/kernel/drivers/watchdog/
+#
+# to see if they load, present /dev/watchdog, and are capable of
+# resetting the system on time-out.
+
+watchdog-device    = /dev/watchdog0
+
+# Uncomment and edit this line for hardware timeout values that differ
+# from the default of one minute.
+
+#watchdog-timeout = 60
+
+# If your watchdog trips by itself when the first timeout interval
+# elapses then try uncommenting the line below and changing the
+# value to 'yes'.
+
+#watchdog-refresh-use-settimeout  = auto
+
+
+# ====================== Other system settings ========================
+#
+# Interval between tests. Should be a couple of seconds shorter than
+# the hardware time-out value.
+
+#interval   = 1
+
+# The number of intervals skipped before a log message is written (i.e.
+# a multiplier for 'interval' in terms of syslog messages)
+
+#logtick        = 1
+
+# Directory for log files (probably best not to change this)
+
+#log-dir    = /var/log/watchdog
+
+
+# Lock the daemon in to memory as a real-time process. This greatly
+# decreases the chance that watchdog won't be scheduled before your
+# machine is really loaded.
+
+realtime    = yes
+priority    = 1
+
+# ====================== How to handle errors  =======================
+#
+# If you have a custom binary/script to handle errors then uncomment
+# this line and provide the path. For 'v1' test binary files they also
+# handle error cases.
+
+# repair-binary    =
+#repair-timeout   = 60
+
+# The retry-timeout and repair limit are used to handle errors in a
+# more robust manner. Errors must persist for longer than this to
+# action a repair or reboot, and if repair-maximum attempts are
+# made without the test passing a reboot is initiated anyway.
+
+retry-timeout   = 100
+repair-maximum   = 100
+
+# ====================== User-specified tests ========================
+#
+# Specify the directory for auto-added 'v1' test programs (any executable
+# found in the 'test-directory' should be listed).
+
+#test-directory = /etc/watchdog.d
+
+# Specify any v0 custom tests here. Multiple lines are permitted, but
+# having any 'v1' programs/scripts discovered in the 'test-directory' is
+# the better way.
+
+test-binary = /usr/local/bin/rootfs-check.sh
+
+# Specify the time-out value for a test error to be reported.
+
+test-timeout    = 100
+
+# ====================== Typical tests ===============================
+#
+
+# Uncomment to enable load average tests for 1, 5 and 15 minute
+# averages. Setting one of these values to '0' disables it. These
+# values will hopefully never reboot your machine during normal use
+# (if your machine is really hung, the loadavg will go much higher
+# than 25 in most cases).
+
+max-load-1   = 24
+#max-load-5   = 18
+#max-load-15    = 12
\ No newline at end of file