Mercurial > code > home > repos > infra
view kube.py @ 307:89b948759d64
update skaffold
author | drewp@bigasterisk.com |
---|---|
date | Sat, 24 Aug 2024 15:07:26 -0700 |
parents | 9e15c07d5258 |
children | a135aa45861a |
line wrap: on
line source
"""pyinfra deploy: install and configure k3s (server + agent nodes) and related tooling."""
import io
import os
import subprocess
from tempfile import NamedTemporaryFile

from pyinfra import host
from pyinfra.facts.files import FindInFile
from pyinfra.facts.server import Arch, LinuxDistribution
from pyinfra.operations import files, server, systemd, apt

# https://github.com/GoogleContainerTools/skaffold/releases
skaffold_version = 'v2.13.2'

# Read from the machine running pyinfra (see _read_node_token).
_NODE_TOKEN_PATH = '/var/lib/rancher/k3s/server/node-token'


def download_k3s(k3s_version):
    """Download the k3s release binary for this host's architecture to /usr/local/bin/k3s.

    k3s_version: release tag used in the GitHub download URL (e.g. 'v1.29.1+k3s1').
    """
    binary_by_arch = {'x86_64': 'k3s', 'aarch64': 'k3s-arm64'}
    # Anything that is not x86_64 or aarch64 gets the armhf build.
    tail = binary_by_arch.get(host.get_fact(Arch), 'k3s-armhf')
    files.download(
        src=f'https://github.com/rancher/k3s/releases/download/{k3s_version}/{tail}',
        dest='/usr/local/bin/k3s',
        user='root',
        group='root',
        mode='755',
        cache_time=43000,
        # force=True,  # to get a new version
    )


def install_skaffold(reg):
    """Install the pinned skaffold release and register `reg` as an insecure registry.

    reg: registry address (host:port) passed to `skaffold config set`.
    """
    # NOTE(review): the download URL is hard-coded to linux-amd64.
    files.download(
        src=f'https://storage.googleapis.com/skaffold/releases/{skaffold_version}/skaffold-linux-amd64',
        dest='/usr/local/bin/skaffold',
        user='root',
        group='root',
        mode='755',
        cache_time=1000,
    )
    # one time; writes to $HOME
    server.shell(commands=f"skaffold config set --global insecure-registries {reg}")


def host_prep():
    """Persist the sysctl settings (forwarding, inotify limits, rp_filter) used on cluster hosts."""
    server.sysctl(key='net.ipv4.ip_forward', value="1", persist=True)
    server.sysctl(key='net.ipv6.conf.all.forwarding', value="1", persist=True)
    server.sysctl(key='fs.inotify.max_user_instances', value='8192', persist=True)
    server.sysctl(key='fs.inotify.max_user_watches', value='524288', persist=True)

    # https://sysctl-explorer.net/net/ipv4/rp_filter/
    # rp_filter modes: 0 = none, 1 = strict, 2 = loose
    loose = 2
    server.sysctl(key='net.ipv4.conf.default.rp_filter', value=loose, persist=True)

    # don't try to get aufs-dkms on rpi-- https://github.com/docker/for-linux/issues/709


def podman_insecure_registry(reg):
    """Render the k3s and podman registry config files for `reg`.

    On hosts with the `k8s_admin` data flag, also manages the user-mode
    podman service and socket units.
    """
    # docs: https://rancher.com/docs/k3s/latest/en/installation/private-registry/
    # user confusions: https://github.com/rancher/k3s/issues/1802
    files.template(
        src='templates/kube/registries.yaml.j2',
        dest='/etc/rancher/k3s/registries.yaml',
        reg=reg,
    )
    files.template(
        src='templates/kube/podman_registries.conf.j2',
        dest='/etc/containers/registries.conf.d/reg.conf',
        reg=reg,
    )
    if host.data.get('k8s_admin'):
        systemd.service(service='podman', user_mode=True)
        systemd.service(service='podman.socket', user_mode=True)
    # and maybe edit /etc/containers/policy.json


def _read_node_token():
    """Return the stripped contents of the local node-token file, or None if it does not exist."""
    if not os.path.exists(_NODE_TOKEN_PATH):
        return None
    # Context manager so the file handle is closed promptly.
    with open(_NODE_TOKEN_PATH, 'rt') as token_file:
        return token_file.read().strip()


def config_and_run_service(k3s_version, server_node, server_ip):
    """Install k3s on this host, write its config and systemd unit, and (re)start it.

    k3s_version: release tag handed to download_k3s.
    server_node: host name of the k3s server; every other host is set up as an agent.
    server_ip: passed through to the config template.

    Returns early (after printing a notice) on agent hosts when the node-token
    file is not yet present on the machine running pyinfra.
    """
    download_k3s(k3s_version)

    is_server = host.name == server_node
    if is_server:
        service_name = 'k3s.service'
        role = 'server'
        which_conf = 'config-server.yaml.j2'
    else:
        service_name = 'k3s-node.service'
        role = 'agent'
        which_conf = 'config-agent.yaml.j2'

    files.put(src="files/kube/kubelet.config", dest="/etc/rancher/k3s/kubelet.config")

    # /var/lib/rancher/k3s/server/node-token is the source of the string in secrets/k3s_token,
    # so this presumes a previous run
    if is_server:
        token = "ununsed"
    else:
        # this assumes localhost is the k3s server.
        token = _read_node_token()
        if token is None:
            print("first pass is for server only- skipping other nodes")
            return

    files.template(
        src=f'templates/kube/{which_conf}',
        dest='/etc/k3s_config.yaml',
        server_ip=server_ip,
        token=token,
        wg_ip=host.host_data['wireguard_address'],
    )

    files.template(
        src='templates/kube/k3s.service.j2',
        dest=f'/etc/systemd/system/{service_name}',
        role=role,
    )

    if not host.data.get('gpu'):
        # no supported gpu; placeholder for labels that are currently applied by hand:
        #   kubectl label --overwrite node bang nvidia.com/gpu.deploy.gpu-feature-discovery=false
        #   kubectl label --overwrite node bang nvidia.com/gpu.deploy.container-toolkit=false
        #   kubectl label --overwrite node bang nvidia.com/gpu.deploy.dcgm-exporter=false
        #   kubectl label --overwrite node bang nvidia.com/gpu.deploy.device-plugin=false
        #   kubectl label --overwrite node bang nvidia.com/gpu.deploy.driver=false
        #   kubectl label --overwrite node bang nvidia.com/gpu.deploy.mig-manager=false
        #   kubectl label --overwrite node bang nvidia.com/gpu.deploy.operator-validator=false
        pass

    systemd.service(service=service_name, daemon_reload=True, enabled=True, restarted=True)


def setupNvidiaToolkit():
    """Placeholder: currently performs no operations; the notes below record the manual steps."""
    # guides:
    #   https://github.com/NVIDIA/k8s-device-plugin#prerequisites
    #   https://docs.k3s.io/advanced#nvidia-container-runtime-support
    # apply this once to kube-system: https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.14.3/nvidia-device-plugin.yml
    # apply this once: https://raw.githubusercontent.com/NVIDIA/gpu-feature-discovery/v0.8.2/deployments/static/nfd.yaml
    # and: kubectl apply -f https://raw.githubusercontent.com/NVIDIA/gpu-feature-discovery/v0.8.2/deployments/static/gpu-feature-discovery-daemonset.yaml

    # k3s says they do this:
    #server.shell('nvidia-ctk runtime configure --runtime=containerd --config /var/lib/rancher/k3s/agent/etc/containerd/config.toml')

    # then caller restarts k3s which includes containerd

    # tried https://github.com/k3s-io/k3s/discussions/9231#discussioncomment-8114243


def make_cluster(
        server_ip,
        server_node,
        nodes,
        # https://github.com/k3s-io/k3s/releases
        # 1.23.6 per https://github.com/cilium/cilium/issues/20331
        k3s_version,
):
    """Configure the current host according to its role in the cluster.

    server_ip: address used for the kube API URL and the k3s config template.
    server_node: host name of the k3s server.
    nodes: host names of the agent nodes.
    k3s_version: k3s release tag to install.

    Hosts that are the server or one of `nodes` get k3s installed and started.
    Hosts with the `k8s_admin` data flag additionally get skaffold, a kubectl
    link, and a copy of the kubeconfig.
    """
    if host.name == server_node or host.name in nodes:
        host_prep()
        files.directory(path='/etc/rancher/k3s')

        podman_insecure_registry(reg='reg:5000')
        # also note that podman dropped the default `docker.io/` prefix on image names (see https://unix.stackexchange.com/a/701785/419418)

        config_and_run_service(k3s_version, server_node, server_ip)

    if host.data.get('k8s_admin'):
        files.directory(path='/etc/rancher/k3s')
        install_skaffold("reg:5000")
        files.link(path='/usr/local/bin/kubectl', target='/usr/local/bin/k3s')
        files.directory(path='/home/drewp/.kube', user='drewp', group='drewp')
        # assumes our pyinfra process is running on server_node
        files.put(
            src='/etc/rancher/k3s/k3s.yaml',
            dest='/etc/rancher/k3s/k3s.yaml',
            user='root',
            group='drewp',
            mode='640',
        )
        server.shell(
            commands=f"kubectl config set-cluster default --server=https://{server_ip}:6443 --kubeconfig=/etc/rancher/k3s/k3s.yaml"
        )


def run_non_k8s_telegraf(node):
    """Install and (re)start telegraf on `node` only, using config fetched from a ConfigMap.

    node: host name to act on; it is also the key read from the
        `telegraf-config` ConfigMap's data. Other hosts are skipped.
    """
    if host.name != node:
        return

    # this CM is written by /my/serv/telegraf/tasks.py
    # kubectl runs locally (on the machine executing pyinfra), not on the target host.
    kubectl_cmd = ["kubectl", "get", "cm", "telegraf-config", "-o", "jsonpath={.data." + node + "}"]
    conf = io.BytesIO(subprocess.check_output(kubectl_cmd))

    apt.packages(packages=['telegraf'])
    files.put(src=conf, dest="/etc/telegraf/telegraf.conf", create_remote_dir=True, assume_exists=True)
    systemd.service(
        service='telegraf',
        running=True,
        enabled=True,
        restarted=True,
    )


make_cluster(
    server_ip="10.5.0.7",
    server_node='ditto',
    nodes=[
        'bang',
        'slash',
        'dash',
        'ws-printer',
        # 'gn-music',
        'li-drums',
    ],
    k3s_version='v1.29.1+k3s1')

run_non_k8s_telegraf('pipe')

# consider https://github.com/derailed/k9s/releases/download/v0.32.4/k9s_Linux_amd64.tar.gz

# k label node ws-printer unschedulable=octoprint-allowed