changeset 265:9d0a3915cc00

WIP on k8s+gpu
author drewp@bigasterisk.com
date Tue, 13 Feb 2024 10:43:18 -0800
parents 124cb46da10f
children 08b26e996eef
files kube.py package_lists.py
diffstat 2 files changed, 16 insertions(+), 1 deletions(-)
--- a/kube.py	Tue Feb 13 10:42:38 2024 -0800
+++ b/kube.py	Tue Feb 13 10:43:18 2024 -0800
@@ -95,9 +95,21 @@
     systemd.service(service=service_name, daemon_reload=True, enabled=True, restarted=True)
 
 def setupNvidiaToolkit():
-    server.shell('nvidia-ctk runtime configure --runtime=containerd --config /var/lib/rancher/k3s/agent/etc/containerd/config.toml')
+    # guides:
+    #   https://github.com/NVIDIA/k8s-device-plugin#prerequisites
+    #   https://docs.k3s.io/advanced#nvidia-container-runtime-support
+    # apply once to kube-system (see the sketch after this diff): kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.3/nvidia-device-plugin.yml
+    # apply once: kubectl apply -f https://raw.githubusercontent.com/NVIDIA/gpu-feature-discovery/v0.8.2/deployments/static/nfd.yaml
+    # apply once: kubectl apply -f https://raw.githubusercontent.com/NVIDIA/gpu-feature-discovery/v0.8.2/deployments/static/gpu-feature-discovery-daemonset.yaml
+
+    # per the k3s docs above, k3s already does this itself, so it's left commented out:
+    #server.shell('nvidia-ctk runtime configure --runtime=containerd --config /var/lib/rancher/k3s/agent/etc/containerd/config.toml')
+
     # then caller restarts k3s which includes containerd
 
+    # tried https://github.com/k3s-io/k3s/discussions/9231#discussioncomment-8114243
+    pass
+
 def make_cluster(
         server_ip = "10.5.0.1",
         server_node = 'bang',
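A minimal pyinfra-style sketch of the one-time "kubectl apply" steps listed in the setupNvidiaToolkit comments. applyNvidiaManifests is a hypothetical helper, not part of this changeset; it assumes kubectl on the target node can reach the cluster and reuses the manifest URLs quoted above.

# hypothetical helper, not in this changeset
from pyinfra.operations import server

NVIDIA_MANIFESTS = [
    # device plugin (its DaemonSet is declared in the kube-system namespace)
    'https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.3/nvidia-device-plugin.yml',
    # node-feature-discovery, then gpu-feature-discovery on top of it
    'https://raw.githubusercontent.com/NVIDIA/gpu-feature-discovery/v0.8.2/deployments/static/nfd.yaml',
    'https://raw.githubusercontent.com/NVIDIA/gpu-feature-discovery/v0.8.2/deployments/static/gpu-feature-discovery-daemonset.yaml',
]

def applyNvidiaManifests():
    # "kubectl apply" is idempotent, so rerunning this deploy is harmless
    for url in NVIDIA_MANIFESTS:
        server.shell(f'kubectl apply -f {url}')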
--- a/package_lists.py	Tue Feb 13 10:42:38 2024 -0800
+++ b/package_lists.py	Tue Feb 13 10:43:18 2024 -0800
@@ -311,4 +311,7 @@
 ]
 k8s_node_with_nvidia_gpu = [
     'nvidia-container-toolkit',
+    'nvidia-container-runtime',
+    'cuda-drivers-fabricmanager-535',
+    'nvidia-headless-535-server',
     ]
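A usage sketch for the list above, assuming the NVIDIA apt repositories (CUDA / nvidia-container-toolkit) are already configured on the host; the apt.packages call below is illustrative, not part of this changeset.

# hypothetical deploy snippet, not in this changeset
from pyinfra.operations import apt

import package_lists

apt.packages(
    name='install packages for a k8s node with an nvidia gpu',
    packages=package_lists.k8s_node_with_nvidia_gpu,
    update=True,  # apt-get update first so the 535 driver packages resolve
)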