view kube-gpu/start.sh @ 332:d4893670f888 default tip

WIP: use watchdog reboot timer on pi
author drewp@bigasterisk.com
date Thu, 27 Feb 2025 11:09:29 -0800
parents 34ab4aec7d4b
children
line wrap: on
line source

#!/bin/bash
#
# Install the NVIDIA gpu-operator Helm chart as release "nvidiagpu" in the
# "gpu-operator" namespace, pointing the operator's container toolkit at the
# k3s-managed containerd config and socket.
#
# Expects the Helm binary at ./linux-amd64/helm relative to the current
# directory.

# Abort on the first failing command so that a failed `repo add` or
# `repo update` does not fall through to `helm install`.
set -euo pipefail

helm=linux-amd64/helm

"$helm" repo add nvidia https://helm.ngc.nvidia.com/nvidia
"$helm" repo update

# Each --set expression is single-quoted: the `[N]` index is a glob bracket
# expression to the shell and must reach Helm literally.
# --set-string is used where the value must stay the string "true" rather
# than being parsed by Helm as a boolean.
chart_flags=(
  -n gpu-operator --create-namespace

  # Container toolkit: use the containerd instance that k3s runs.
  --set 'toolkit.env[0].name=CONTAINERD_CONFIG'
  --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml'
  --set 'toolkit.env[1].name=CONTAINERD_SOCKET'
  --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock'
  --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS'
  --set 'toolkit.env[2].value=nvidia'
  --set 'toolkit.env[3].name=CONTAINERD_SET_AS_DEFAULT'
  --set-string 'toolkit.env[3].value=true'

  # Driver validator: skip creation of the /dev/char symlinks.
  --set 'validator.driver.env[0].name=DISABLE_DEV_CHAR_SYMLINK_CREATION'
  --set-string 'validator.driver.env[0].value=true'
)

"$helm" install --wait nvidiagpu "${chart_flags[@]}" nvidia/gpu-operator

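# If the driver validator still fails with the error quoted below, it may also
# be necessary to edit the ClusterPolicy by hand (`k edit`, presumably
# `kubectl edit`) and apply the setting that the message describes: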

# Error: error validating driver installation: 
# error creating symlink creator: 
# failed to create NVIDIA device nodes: 
# failed to create device node nvidiactl: 
# failed to determine major: 
# invalid device node

# Failed to create symlinks under /dev/char that point to all possible NVIDIA character devices.
# The existence of these symlinks is required to address the following bug:

#     https://github.com/NVIDIA/gpu-operator/issues/430

# This bug impacts container runtimes configured with systemd cgroup management enabled.
# To disable the symlink creation, set the following envvar in ClusterPolicy:

#     validator:
#       driver:
#         env:
#         - name: DISABLE_DEV_CHAR_SYMLINK_CREATION
#           value: "true"