Mercurial > code > home > repos > infra
changeset 268:34ab4aec7d4b
notes and changes for getting nvidia gpu k3d support going, which was very hard
author | drewp@bigasterisk.com |
---|---|
date | Wed, 14 Feb 2024 18:48:08 -0800 |
parents | 564b62e59484 |
children | 665a199f7c8a |
files | kube-gpu/start.sh kube.py package_lists.py packages.py templates/sources.list.j2 |
diffstat | 5 files changed, 77 insertions(+), 41 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/kube-gpu/start.sh Wed Feb 14 18:48:08 2024 -0800
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+linux-amd64/helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \
+  && linux-amd64/helm repo update
+linux-amd64/helm install --wait nvidiagpu \
+  -n gpu-operator --create-namespace \
+  --set toolkit.env[0].name=CONTAINERD_CONFIG --set toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml \
+  --set toolkit.env[1].name=CONTAINERD_SOCKET --set toolkit.env[1].value=/run/k3s/containerd/containerd.sock \
+  --set toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS --set toolkit.env[2].value=nvidia \
+  --set toolkit.env[3].name=CONTAINERD_SET_AS_DEFAULT --set-string toolkit.env[3].value=true \
+  --set validator.driver.env[0].name="DISABLE_DEV_CHAR_SYMLINK_CREATION" --set-string validator.driver.env[0].value="true" \
+  nvidia/gpu-operator
+
+# and maybe k edit ClusterPolicy to do this:
+
+# Error: error validating driver installation:
+# error creating symlink creator:
+# failed to create NVIDIA device nodes:
+# failed to create device node nvidiactl:
+# failed to determine major:
+# invalid device node
+
+# Failed to create symlinks under /dev/char that point to all possible NVIDIA character devices.
+# The existence of these symlinks is required to address the following bug:
+
+# https://github.com/NVIDIA/gpu-operator/issues/430
+
+# This bug impacts container runtimes configured with systemd cgroup management enabled.
+# To disable the symlink creation, set the following envvar in ClusterPolicy:
+
+# validator:
+#   driver:
+#     env:
+#     - name: DISABLE_DEV_CHAR_SYMLINK_CREATION
+#       value: \"true\""
\ No newline at end of file
--- a/kube.py Wed Feb 14 18:45:31 2024 -0800 +++ b/kube.py Wed Feb 14 18:48:08 2024 -0800 @@ -56,6 +56,7 @@ if is_pi: pi_cgroup_setup() + # don't try to get aufs-dkms on rpi-- https://github.com/docker/for-linux/issues/709 def podman_insecure_registry(reg): files.template(src='templates/kube/podman_registries.conf.j2', dest='/etc/containers/registries.conf.d/reg.conf', reg=reg) @@ -91,8 +92,17 @@ dest=f'/etc/systemd/system/{service_name}', role=role, ) - if host.name in ['dash', 'bang', 'ditto']: - setupNvidiaToolkit() + if host.name in ['bang', 'garage']: + # no supported gpu + ''' + kubectl label --overwrite node bang nvidia.com/gpu.deploy.gpu-feature-discovery=false + kubectl label --overwrite node bang nvidia.com/gpu.deploy.container-toolkit=false + kubectl label --overwrite node bang nvidia.com/gpu.deploy.dcgm-exporter=false + kubectl label --overwrite node bang nvidia.com/gpu.deploy.device-plugin=false + kubectl label --overwrite node bang nvidia.com/gpu.deploy.driver=false + kubectl label --overwrite node bang nvidia.com/gpu.deploy.mig-manager=false + kubectl label --overwrite node bang nvidia.com/gpu.deploy.operator-validator=false + ''' systemd.service(service=service_name, daemon_reload=True, enabled=True, restarted=True)
--- a/package_lists.py Wed Feb 14 18:45:31 2024 -0800 +++ b/package_lists.py Wed Feb 14 18:48:08 2024 -0800 @@ -276,42 +276,32 @@ xorg = [ 'kbd', - - # squib GT640 needs driver 470 - # plus is on 535 - # dash is on 525 - # slash is on 525 - - # 'libnvidia-cfg1-525', - # 'libnvidia-common-525', - # # not on slash ,maybe needed for dash/steam? - # # 'libnvidia-compute-525:i386', - # 'libnvidia-compute-525', - # #'libnvidia-decode-525:i386', - # 'libnvidia-decode-525', - # #'libnvidia-encode-525:i386', - # 'libnvidia-encode-525', - # 'libnvidia-extra-525', - # #'libnvidia-fbc1-525:i386', - # 'libnvidia-fbc1-525', - # #'libnvidia-gl-525:i386', - # 'libnvidia-gl-525', - # 'nvidia-compute-utils-525', - # 'nvidia-dkms-525', - # 'nvidia-driver-525', - # 'nvidia-kernel-common-525', - # 'nvidia-kernel-source-525', - # 'nvidia-utils-525', - # 'xserver-xorg-video-nvidia-525', 'nvidia-modprobe', 'nvidia-prime', 'nvidia-settings', 'screen-resolution-extra', 'xserver-xorg', ] -k8s_node_with_nvidia_gpu = [ - 'nvidia-container-toolkit', - 'nvidia-container-runtime', - 'cuda-drivers-fabricmanager-535', - 'nvidia-headless-535-server', - ] + + +def k8s_node_with_nvidia_gpu(hostName): + version = { + 'dash': '545', + 'dot': '545', + 'slash': '525', + 'ditto': '535-server', + # 'bang': '390-server', # no longer in ubuntu + 'squib': '470', + }[hostName] + number = version.replace('-server', '') + return [ + 'nvidia-container-runtime', + f'nvidia-headless-{version}', + f'nvidia-utils-{version}', # this gets mysteriously reverted on ditto- see workaround in packages.py + f'libnvidia-encode-{number}', + f'nvidia-driver-{version}', + ] + ([] if 'server' in version else [ + f'xserver-xorg-video-nvidia-{version}', + ]) + +
--- a/packages.py Wed Feb 14 18:45:31 2024 -0800 +++ b/packages.py Wed Feb 14 18:48:08 2024 -0800 @@ -82,20 +82,21 @@ if host.name == 'plus': apt.packages(packages=package_lists.laptop, **kw) -if host.name in ['dash', 'bang', 'ditto']: - apt.packages(packages=package_lists.k8s_node_with_nvidia_gpu, **kw) +if host.name in ['dash', 'slash', 'ditto', 'dot']: + apt.packages(packages=package_lists.k8s_node_with_nvidia_gpu(host.name)) # no kw, or apt will remove nvidia-utils-VERS (!) +if host.name == 'ditto': + # should have happened in the previous step, but it gets reverted. + apt.packages(packages=['nvidia-utils-535-server']) if not is_pi: apt.packages(packages=package_lists.non_pi, **kw) - if host.name != 'prime': # couldn't get prime to install a newer version than 18.7.0 - nodejs() else: # move to another file? files.template(src="templates/pigpiod.service.j2", dest="/etc/systemd/system/pigpiod.service") systemd.service(service='pigpiod', daemon_reload=True, enabled=True) -desktop_env = host.name in ['dash', 'slash', 'plus', 'squib'] +desktop_env = host.name in ['dash', 'slash', 'plus', 'dot', 'squib'] if desktop_env: apt.packages(packages=package_lists.xorg + package_lists.desktop, **kw) if desktop_env or host.name in ['bang', 'ditto']:
--- a/templates/sources.list.j2 Wed Feb 14 18:45:31 2024 -0800 +++ b/templates/sources.list.j2 Wed Feb 14 18:48:08 2024 -0800 @@ -12,7 +12,7 @@ {% endif %} # k8s node with nvidia gpu -{% if host.name in ['dash', 'bang', 'ditto'] %} +{% if host.name in ['dash', 'ditto', 'slash', 'dot'] %} deb [signed-by=/etc/apt/keyrings/nvidia.gpg] https://nvidia.github.io/libnvidia-container/stable/deb/$(ARCH) / {% endif %}