diff kube-gpu/start.sh @ 268:34ab4aec7d4b

notes and changes for getting nvidia gpu k3d support going, which was very hard
author drewp@bigasterisk.com
date Wed, 14 Feb 2024 18:48:08 -0800
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kube-gpu/start.sh	Wed Feb 14 18:48:08 2024 -0800
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Install the NVIDIA GPU operator into a k3s/k3d cluster, using the helm
+# binary at ./linux-amd64/helm (so run this from the directory holding it).
+#
+# The toolkit.env values hand the operator's container toolkit the k3s
+# containerd config file and socket paths, name the runtime class "nvidia",
+# and request it as the default runtime. The validator.driver.env value
+# turns off /dev/char symlink creation; the notes at the end of this
+# script quote the validator error that motivated it.
+#
+# Every --set argument is single-quoted: an unquoted [N] is a glob pattern,
+# so pathname expansion could rewrite or drop the argument.
+
+linux-amd64/helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \
+   && linux-amd64/helm repo update
+linux-amd64/helm install --wait nvidiagpu \
+     -n gpu-operator --create-namespace \
+    --set 'toolkit.env[0].name=CONTAINERD_CONFIG' --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \
+    --set 'toolkit.env[1].name=CONTAINERD_SOCKET' --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \
+    --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' --set 'toolkit.env[2].value=nvidia' \
+    --set 'toolkit.env[3].name=CONTAINERD_SET_AS_DEFAULT' --set-string 'toolkit.env[3].value=true' \
+    --set 'validator.driver.env[0].name=DISABLE_DEV_CHAR_SYMLINK_CREATION' --set-string 'validator.driver.env[0].value=true' \
+     nvidia/gpu-operator
+
+# If the validator error below still appears, you may also need to run `kubectl edit ClusterPolicy` and apply the setting it describes:
+
+# Error: error validating driver installation: 
+# error creating symlink creator: 
+# failed to create NVIDIA device nodes: 
+# failed to create device node nvidiactl: 
+# failed to determine major: 
+# invalid device node
+
+# Failed to create symlinks under /dev/char that point to all possible NVIDIA character devices.
+# The existence of these symlinks is required to address the following bug:
+
+#     https://github.com/NVIDIA/gpu-operator/issues/430
+
+# This bug impacts container runtimes configured with systemd cgroup management enabled.
+# To disable the symlink creation, set the following envvar in ClusterPolicy:
+
+#     validator:
+#       driver:
+#         env:
+#         - name: DISABLE_DEV_CHAR_SYMLINK_CREATION
+#           value: "true"
\ No newline at end of file