changeset 268:34ab4aec7d4b

notes and changes for getting nvidia gpu k3d support going, which was very hard
author drewp@bigasterisk.com
date Wed, 14 Feb 2024 18:48:08 -0800
parents 564b62e59484
children 665a199f7c8a
files kube-gpu/start.sh kube.py package_lists.py packages.py templates/sources.list.j2
diffstat 5 files changed, 77 insertions(+), 41 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kube-gpu/start.sh	Wed Feb 14 18:48:08 2024 -0800
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Install the nvidia gpu-operator helm chart, pointing its container toolkit at the k3s containerd config and socket.
+linux-amd64/helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && linux-amd64/helm repo update
+# The --set keys are quoted so bash never treats [0], [1], ... as glob patterns.
+linux-amd64/helm install --wait nvidiagpu \
+     -n gpu-operator --create-namespace \
+    --set 'toolkit.env[0].name=CONTAINERD_CONFIG' --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \
+    --set 'toolkit.env[1].name=CONTAINERD_SOCKET' --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \
+    --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' --set 'toolkit.env[2].value=nvidia' \
+    --set 'toolkit.env[3].name=CONTAINERD_SET_AS_DEFAULT' --set-string 'toolkit.env[3].value=true' \
+    --set 'validator.driver.env[0].name=DISABLE_DEV_CHAR_SYMLINK_CREATION' --set-string 'validator.driver.env[0].value=true' \
+     nvidia/gpu-operator
+
+# The validator.driver.env setting above is for the validator error below; maybe also `k edit ClusterPolicy` to apply what it suggests:
+
+# Error: error validating driver installation:
+# error creating symlink creator:
+# failed to create NVIDIA device nodes:
+# failed to create device node nvidiactl:
+# failed to determine major:
+# invalid device node
+
+# Failed to create symlinks under /dev/char that point to all possible NVIDIA character devices.
+# The existence of these symlinks is required to address the following bug:
+
+#     https://github.com/NVIDIA/gpu-operator/issues/430
+
+# This bug impacts container runtimes configured with systemd cgroup management enabled.
+# To disable the symlink creation, set the following envvar in ClusterPolicy:
+
+#     validator:
+#       driver:
+#         env:
+#         - name: DISABLE_DEV_CHAR_SYMLINK_CREATION
+#           value: "true"
\ No newline at end of file
--- a/kube.py	Wed Feb 14 18:45:31 2024 -0800
+++ b/kube.py	Wed Feb 14 18:48:08 2024 -0800
@@ -56,6 +56,7 @@
     if is_pi:
         pi_cgroup_setup()
 
+
 # don't try to get aufs-dkms on rpi-- https://github.com/docker/for-linux/issues/709
 def podman_insecure_registry(reg):
     files.template(src='templates/kube/podman_registries.conf.j2', dest='/etc/containers/registries.conf.d/reg.conf', reg=reg)
@@ -91,8 +92,17 @@
         dest=f'/etc/systemd/system/{service_name}',
         role=role,
     )
-    if host.name in ['dash', 'bang', 'ditto']:
-        setupNvidiaToolkit()
+    if host.name in ['bang', 'garage']:
+        # no supported gpu
+        '''
+            kubectl label --overwrite node bang nvidia.com/gpu.deploy.gpu-feature-discovery=false
+            kubectl label --overwrite node bang nvidia.com/gpu.deploy.container-toolkit=false
+            kubectl label --overwrite node bang nvidia.com/gpu.deploy.dcgm-exporter=false
+            kubectl label --overwrite node bang nvidia.com/gpu.deploy.device-plugin=false
+            kubectl label --overwrite node bang nvidia.com/gpu.deploy.driver=false
+            kubectl label --overwrite node bang nvidia.com/gpu.deploy.mig-manager=false
+            kubectl label --overwrite node bang nvidia.com/gpu.deploy.operator-validator=false
+        '''
     systemd.service(service=service_name, daemon_reload=True, enabled=True, restarted=True)
 
 
--- a/package_lists.py	Wed Feb 14 18:45:31 2024 -0800
+++ b/package_lists.py	Wed Feb 14 18:48:08 2024 -0800
@@ -276,42 +276,32 @@
 
 xorg = [
     'kbd',
-
-    # squib GT640 needs driver 470
-    # plus is on 535
-    # dash is on 525
-    # slash is on 525
-
-    # 'libnvidia-cfg1-525',
-    # 'libnvidia-common-525',
-    # # not  on slash ,maybe needed for dash/steam?
-    # #    'libnvidia-compute-525:i386',
-    # 'libnvidia-compute-525',
-    # #'libnvidia-decode-525:i386',
-    # 'libnvidia-decode-525',
-    # #'libnvidia-encode-525:i386',
-    # 'libnvidia-encode-525',
-    # 'libnvidia-extra-525',
-    # #'libnvidia-fbc1-525:i386',
-    # 'libnvidia-fbc1-525',
-    # #'libnvidia-gl-525:i386',
-    # 'libnvidia-gl-525',
-    # 'nvidia-compute-utils-525',
-    # 'nvidia-dkms-525',
-    # 'nvidia-driver-525',
-    # 'nvidia-kernel-common-525',
-    # 'nvidia-kernel-source-525',
-    # 'nvidia-utils-525',
-    # 'xserver-xorg-video-nvidia-525',
     'nvidia-modprobe',
     'nvidia-prime',
     'nvidia-settings',
     'screen-resolution-extra',
     'xserver-xorg',
 ]
-k8s_node_with_nvidia_gpu = [
-    'nvidia-container-toolkit',
-    'nvidia-container-runtime',
-    'cuda-drivers-fabricmanager-535',
-    'nvidia-headless-535-server',
-    ]
+
+
+def k8s_node_with_nvidia_gpu(hostName):  # returns the list of apt package names for hostName's nvidia driver version
+    version = {  # driver version suffix per host; a hostName not listed here raises KeyError
+        'dash': '545',
+        'dot': '545',
+        'slash': '525',
+        'ditto': '535-server',
+        # 'bang': '390-server',  # no longer in ubuntu
+        'squib': '470',
+    }[hostName]
+    number = version.replace('-server', '')  # version without the '-server' suffix, e.g. '535-server' -> '535'
+    return [
+        'nvidia-container-runtime',
+        f'nvidia-headless-{version}',
+        f'nvidia-utils-{version}',  # this gets mysteriously reverted on ditto; see the workaround in packages.py
+        f'libnvidia-encode-{number}',  # NOTE(review): never gets the '-server' suffix, unlike its neighbors; confirm that mix is intended
+        f'nvidia-driver-{version}',
+    ] + ([] if 'server' in version else [  # the xorg video driver is added only for non-server versions
+        f'xserver-xorg-video-nvidia-{version}',
+    ])
+
+
--- a/packages.py	Wed Feb 14 18:45:31 2024 -0800
+++ b/packages.py	Wed Feb 14 18:48:08 2024 -0800
@@ -82,20 +82,21 @@
 if host.name == 'plus':
     apt.packages(packages=package_lists.laptop, **kw)
 
-if host.name in ['dash', 'bang', 'ditto']:
-    apt.packages(packages=package_lists.k8s_node_with_nvidia_gpu, **kw)
+if host.name in ['dash', 'slash', 'ditto', 'dot']:
+    apt.packages(packages=package_lists.k8s_node_with_nvidia_gpu(host.name)) # no kw, or apt will remove nvidia-utils-VERS (!)
 
+if host.name == 'ditto':
+    # should have happened in the previous step, but it gets reverted.
+    apt.packages(packages=['nvidia-utils-535-server'])
 
 if not is_pi:
     apt.packages(packages=package_lists.non_pi, **kw)
-    if host.name != 'prime':  # couldn't get prime to install a newer version than 18.7.0
-        nodejs()
 else:
     # move to another file?
     files.template(src="templates/pigpiod.service.j2", dest="/etc/systemd/system/pigpiod.service")
     systemd.service(service='pigpiod', daemon_reload=True, enabled=True)
 
-desktop_env = host.name in ['dash', 'slash', 'plus', 'squib']
+desktop_env = host.name in ['dash', 'slash', 'plus', 'dot', 'squib']
 if desktop_env:
     apt.packages(packages=package_lists.xorg + package_lists.desktop, **kw)
 if desktop_env or host.name in ['bang', 'ditto']:
--- a/templates/sources.list.j2	Wed Feb 14 18:45:31 2024 -0800
+++ b/templates/sources.list.j2	Wed Feb 14 18:48:08 2024 -0800
@@ -12,7 +12,7 @@
 {% endif %}
 
 # k8s node with nvidia gpu
-{% if host.name in ['dash', 'bang', 'ditto'] %}
+{% if host.name in ['dash', 'ditto', 'slash', 'dot'] %}
 deb [signed-by=/etc/apt/keyrings/nvidia.gpg] https://nvidia.github.io/libnvidia-container/stable/deb/$(ARCH) /
 {% endif %}