Mercurial > code > home > repos > infra
changeset 332:d4893670f888 default tip
WIP: use watchdog reboot timer on pi
author | drewp@bigasterisk.com |
---|---|
date | Thu, 27 Feb 2025 11:09:29 -0800 |
parents | 50a8b6c39b38 |
children | |
files | pi-watchdog/Dockerfile pi-watchdog/deploy.yaml pi-watchdog/readme pi-watchdog/rootfs-check.sh pi-watchdog/run-watchdog.sh pi-watchdog/tasks.py pi-watchdog/watchdog.conf |
diffstat | 7 files changed, 206 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pi-watchdog/Dockerfile Thu Feb 27 11:09:29 2025 -0800 @@ -0,0 +1,20 @@ +FROM alpine:latest + +# Enable testing repository, update, upgrade, and install watchdog +RUN echo "http://dl-cdn.alpinelinux.org/alpine/edge/testing" >> /etc/apk/repositories \ + && apk update \ + && apk upgrade -U \ + && apk add watchdog \ + && apk add --no-cache util-linux + +COPY watchdog.conf /etc/watchdog.conf +COPY rootfs-check.sh /usr/local/bin/rootfs-check.sh +COPY run-watchdog.sh /run-watchdog.sh + +RUN chmod +x /usr/local/bin/rootfs-check.sh \ + && chmod +x /run-watchdog.sh + +# Expose port for reboot signal (optional) +EXPOSE 8080 + +CMD ["/run-watchdog.sh"] \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pi-watchdog/deploy.yaml Thu Feb 27 11:09:29 2025 -0800 @@ -0,0 +1,34 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: pi-watchdog + namespace: default + labels: + app: pi-watchdog +spec: + selector: + matchLabels: + app: pi-watchdog + template: + metadata: + labels: + app: pi-watchdog + spec: + volumes: + - name: dev + hostPath: + path: /dev + containers: + - name: watchdog + image: reg:5000/pi_watchdog + imagePullPolicy: Always + securityContext: + runAsUser: 0 + privileged: true + volumeMounts: + - name: dev + mountPath: /dev + nodeSelector: + # ought to be "only hosts that use netboot/iscsi" + kubernetes.io/arch: arm64 \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pi-watchdog/readme Thu Feb 27 11:09:29 2025 -0800 @@ -0,0 +1,1 @@ +k8s daemonset for setting up pi watchdog reboot timers
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pi-watchdog/rootfs-check.sh Thu Feb 27 11:09:29 2025 -0800 @@ -0,0 +1,18 @@ +#!/bin/sh + +echo "Starting rootfs-check.sh script" + +# check rootfs share is mounted +if ! findmnt /; then + echo "rootfs share is not mounted" + exit 1 +fi + +# check rootfs share is accessible +if ! ls / > /dev/null 2>&1; then + echo "rootfs share is not accessible" + exit 1 +fi + +echo "v1 rootfs share is OK" +exit 0 \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pi-watchdog/run-watchdog.sh Thu Feb 27 11:09:29 2025 -0800 @@ -0,0 +1,9 @@ +#!/bin/sh + +# start watchdog service +watchdog -vF + +# while true; do +# /usr/local/bin/rootfs-check.sh +# sleep 10 +# done
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pi-watchdog/tasks.py Thu Feb 27 11:09:29 2025 -0800 @@ -0,0 +1,24 @@ +import datetime + +from invoke import task + +# todo- see https://github.com/GoogleContainerTools/skaffold/issues/9543 for one obstacle + +now = datetime.datetime.now().replace(microsecond=0) +now = now.isoformat().replace(':', '_').replace('-', '_') +base = 'pi_watchdog' +localImage = f'{base}:{now}' +remoteImage = f'reg:5000/{base}:latest' + + +@task +def build(ctx): # workaround for skaffold build + ctx.run(f'podman manifest create {localImage}', echo=True) + ctx.run(f'podman build --platform linux/arm64 --manifest {localImage} -f Dockerfile .', echo=True) + ctx.run(f'podman manifest push {localImage} {remoteImage}', echo=True) + print(f'{remoteImage} ready for use') + + +@task(pre=[build]) +def run(ctx): + ctx.run('kubectl apply -f deploy.yaml', echo=True)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pi-watchdog/watchdog.conf Thu Feb 27 11:09:29 2025 -0800 @@ -0,0 +1,100 @@ +# ==================================================================== +# Configuration for the watchdog daemon. For more information on the +# parameters in this file use the command 'man watchdog.conf' +# ==================================================================== + +# =================== The hardware timer settings ==================== +# +# Or work your way through the modules listed under: +# +# /lib/modules/`uname -r`/kernel/drivers/watchdog/ +# +# To see if they load, present /dev/watchdog, and are capable of +# resetting the system on time-out. + +watchdog-device = /dev/watchdog + +# Uncomment and edit this line for hardware timeout values that differ +# from the default of one minute. + +#watchdog-timeout = 60 + +# If your watchdog trips by itself when the first timeout interval +# elapses then try uncommenting the line below and changing the +# value to 'yes'. + +#watchdog-refresh-use-settimeout = auto + + +# ====================== Other system settings ======================== +# +# Interval between tests. Should be a couple of seconds shorter than +# the hardware time-out value. + +#interval = 1 + +# The number of intervals skipped before a log message is written (i.e. +# a multiplier for 'interval' in terms of syslog messages) + +#logtick = 1 + +# Directory for log files (probably best not to change this) + +#log-dir = /var/log/watchdog + + +# Lock the daemon in to memory as a real-time process. This greatly +# decreases the chance that watchdog won't be scheduled before your +# machine is really loaded. + +realtime = yes +priority = 1 + +# ====================== How to handle errors ======================= +# +# If you have a custom binary/script to handle errors then uncomment +# this line and provide the path. For 'v1' test binary files they also +# handle error cases. 
+ +# repair-binary = +#repair-timeout = 60 + +# The retry-timeout and repair limit are used to handle errors in a +# more robust manner. Errors must persist for longer than this to +# action a repair or reboot, and if repair-maximum attempts are +# made without the test passing a reboot is initiated anyway. + +retry-timeout = 100 +repair-maximum = 100 + +# ====================== User-specified tests ======================== +# +# Specify the directory for auto-added 'v1' test programs (any executable +# found in the 'test-directory' should be listed). + +#test-directory = /etc/watchdog.d + +# Specify any v0 custom tests here. Multiple lines are permitted, but +# having any 'v1' programs/scripts discovered in the 'test-directory' is +# the better way. + +test-binary = /usr/local/bin/rootfs-check.sh + +# Specify the time-out value for a test error to be reported. + +test-timeout = 100 + +# ====================== Typical tests =============================== +# + +# Uncomment to enable load average tests for 1, 5 and 15 minute +# averages. Setting one of these values to '0' disables it. These +# values will hopefully never reboot your machine during normal use +# (if your machine is really hung, the loadavg will go much higher +# than 25 in most cases). + +max-load-1 = 24 +#max-load-5 = 18 +#max-load-15 = 12 + +watchdog-device = /dev/watchdog0 \ No newline at end of file