Mercurial > code > home > repos > infra
diff kube.py @ 265:9d0a3915cc00
WIP on k8s+gpu
author | drewp@bigasterisk.com |
---|---|
date | Tue, 13 Feb 2024 10:43:18 -0800 |
parents | 47f5aca39a68 |
children | 564b62e59484 |
line wrap: on
line diff
--- a/kube.py	Tue Feb 13 10:42:38 2024 -0800
+++ b/kube.py	Tue Feb 13 10:43:18 2024 -0800
@@ -95,9 +95,21 @@
     systemd.service(service=service_name, daemon_reload=True, enabled=True, restarted=True)
 
 
 def setupNvidiaToolkit():
-    server.shell('nvidia-ctk runtime configure --runtime=containerd --config /var/lib/rancher/k3s/agent/etc/containerd/config.toml')
+    # guides:
+    # https://github.com/NVIDIA/k8s-device-plugin#prerequisites
+    # https://docs.k3s.io/advanced#nvidia-container-runtime-support
+    # apply this once to kube-system: https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.3/nvidia-device-plugin.yml
+    # apply this once: https://raw.githubusercontent.com/NVIDIA/gpu-feature-discovery/v0.8.2/deployments/static/nfd.yaml
+    # and: kubectl apply -f https://raw.githubusercontent.com/NVIDIA/gpu-feature-discovery/v0.8.2/deployments/static/gpu-feature-discovery-daemonset.yaml
+
+    # k3s says they do this:
+    #server.shell('nvidia-ctk runtime configure --runtime=containerd --config /var/lib/rancher/k3s/agent/etc/containerd/config.toml')
+    # then caller restarts k3s which includes containerd
+    # tried https://github.com/k3s-io/k3s/discussions/9231#discussioncomment-8114243
+    pass
+
 
 def make_cluster(
     server_ip = "10.5.0.1",
     server_node = 'bang',