diff kube.py @ 265:9d0a3915cc00

WIP on k8s+gpu
author drewp@bigasterisk.com
date Tue, 13 Feb 2024 10:43:18 -0800
parents 47f5aca39a68
children 564b62e59484
line wrap: on
line diff
--- a/kube.py	Tue Feb 13 10:42:38 2024 -0800
+++ b/kube.py	Tue Feb 13 10:43:18 2024 -0800
@@ -95,9 +95,21 @@
     systemd.service(service=service_name, daemon_reload=True, enabled=True, restarted=True)
 
 def setupNvidiaToolkit():
-    server.shell('nvidia-ctk runtime configure --runtime=containerd --config /var/lib/rancher/k3s/agent/etc/containerd/config.toml')
+    # guides:
+    #   https://github.com/NVIDIA/k8s-device-plugin#prerequisites
+    #   https://docs.k3s.io/advanced#nvidia-container-runtime-support
+    # apply once, to kube-system: kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.3/nvidia-device-plugin.yml
+    # apply once: kubectl apply -f https://raw.githubusercontent.com/NVIDIA/gpu-feature-discovery/v0.8.2/deployments/static/nfd.yaml
+    # apply once: kubectl apply -f https://raw.githubusercontent.com/NVIDIA/gpu-feature-discovery/v0.8.2/deployments/static/gpu-feature-discovery-daemonset.yaml
+
+    # k3s says it already does this step itself, so the command is disabled here:
+    #server.shell('nvidia-ctk runtime configure --runtime=containerd --config /var/lib/rancher/k3s/agent/etc/containerd/config.toml')
+
     # then caller restarts k3s which includes containerd
 
+    # WIP: also tried the approach from https://github.com/k3s-io/k3s/discussions/9231#discussioncomment-8114243
+    pass
+
 def make_cluster(
         server_ip = "10.5.0.1",
         server_node = 'bang',