Changeset - b1f75b0584f3
[Not reviewed]
default
2 2 0
drewp@bigasterisk.com - 19 months ago 2023-06-22 06:01:32
drewp@bigasterisk.com
redo 'run' task and 'delete' (less tested)
4 files changed with 68 insertions and 57 deletions:
0 comments (0 inline, 0 general)
config/kustomization.yaml
Show inline comments
 
deleted file
make_global.py
Show inline comments
 
#!/usr/bin/python3
 

	
 
import json
 
import subprocess
 
import sys
 
import time
 

	
 
POM_CERT_NAME = 'pomerium-proxy-tls'
 
AUTH_HOST = 'authenticate2.bigasterisk.com'
 

	
 
def getSuffixedName() -> str:
    # NOTE(review): as shown, this body only binds ``ns`` and then falls off
    # the end, so it returns None despite the ``-> str`` annotation. Confirm
    # whether this definition is a leftover from an earlier revision.
    ns = 'pomerium'

# The script requires exactly one command-line argument: the single-element
# tuple unpacking raises ValueError for any other argument count.
(phase,) = sys.argv[1:]
 

	
 

	
 
def secretExists(qname):
    """Check that the Kubernetes secret named by ``qname`` exists.

    ``qname`` has the form ``"namespace/name"``. Every secret in that
    namespace is listed with ``kubectl get -n NAMESPACE secret -o json`` and
    the result is searched for an exact name match. Prefix matches (for
    example a suffixed ``pomerium-proxy-tls-xxxx`` secret) deliberately do
    not count.

    Returns:
        None when the secret is present.

    Raises:
        ValueError: if ``qname`` is not exactly ``namespace/name`` or no
            secret with that exact name is listed.
        subprocess.CalledProcessError: if the kubectl call fails.
    """
    ns, localName = qname.split('/')
    listing = json.loads(
        subprocess.check_output(["kubectl", "get", "-n", ns, "secret", "-o", "json"]).decode('utf8'))
    if any(item['metadata']['name'] == localName for item in listing['items']):
        return
    raise ValueError(f"secret {qname} not found")
 

	
 

	
 
def retryGetSuffixedName() -> str:
    # NOTE(review): as shown, this only writes a progress prefix to stderr and
    # returns None despite the ``-> str`` annotation. Confirm whether it is a
    # leftover; the retry loop itself lives in waitForSecret.
    sys.stderr.write("\nwait for secret: ")
 
def waitForSecret(qname):
    """Block until the secret ``qname`` ("namespace/name") exists.

    Polls secretExists() up to 100 times. After each miss a progress dot is
    written to stderr and the function sleeps 10 seconds before retrying.

    Returns:
        Whatever secretExists() returns on the first successful check.

    Raises:
        ValueError: when the secret is still missing after every attempt.
    """
    sys.stderr.write(f"\nwait for secret {qname}: ")
    for _ in range(100):
        try:
            return secretExists(qname)
        except ValueError:
            # Not there yet: show progress and give the issuer more time.
            sys.stderr.write('.')
            sys.stderr.flush()
            time.sleep(10)
    raise ValueError(f"gave up waiting for secret {qname}")
 

	
 
def pomeriumGlobalConfig():
    """Build the ``Pomerium`` "global" settings resource as a plain dict.

    When the module-level ``phase`` is ``'wait_for_cert'``, this blocks until
    the proxy TLS secret exists and then lists that secret under
    ``spec.certificates``. In every other phase the certificate list is left
    out of the resource.

    Returns:
        A dict ready to be serialized with ``json.dumps``.

    Raises:
        ValueError: propagated from waitForSecret() if the secret never
            appears.
    """
    config = {
        'apiVersion': "ingress.pomerium.io/v1",
        'kind': "Pomerium",
        'metadata': {
            'name': "global"
        },
        'spec': {
            'secrets': "pomerium/bootstrap",
            'authenticate': {
                'url': f"https://{AUTH_HOST}"
            },
            'cookie': {
                'expire': "20h"
            },
            'identityProvider': {
                'provider': "oidc",
                'url': "https://accounts.google.com",
                'scopes': [
                    "openid",
                    "email",
                    "profile"  # adds name+locale to user details
                ],
                'secret': "pomerium/idp"
            },
            # 'storage': {
            #     'postgres': {
            #         'secret': "pomerium/postgres-connection-key"
            #     }
            # },
        }
    }

    if phase == 'wait_for_cert':
        # Use one name for both the wait and the reference so they cannot
        # drift apart.
        certSecret = f'pomerium/{POM_CERT_NAME}'
        waitForSecret(certSecret)
        config['spec']['certificates'] = [certSecret]

    # Terminate any progress output written to stderr.
    sys.stderr.write('\n')
    return config
 

	
 
def pomCert():
    """Return the cert-manager ``Certificate`` resource for the auth host.

    The certificate is named ``POM_CERT_NAME`` in the ``pomerium`` namespace,
    covers ``AUTH_HOST``, is issued by the ``letsencrypt-dns-prod``
    ClusterIssuer, and is stored in the ``pomerium-proxy-tls`` secret.
    """
    metadata = {
        "name": POM_CERT_NAME,
        "namespace": "pomerium",
    }
    issuer = {
        "kind": "ClusterIssuer",
        "name": "letsencrypt-dns-prod",
    }
    spec = {
        "dnsNames": [AUTH_HOST],
        "issuerRef": issuer,
        "secretName": "pomerium-proxy-tls",
    }
    return {
        "apiVersion": "cert-manager.io/v1",
        "kind": "Certificate",
        "metadata": metadata,
        "spec": spec,
    }
 

	
 
# Old note: pom won't start up if this cert doesn't exist, so you have to run once
 
# with it commented out, then after cert success, run again with it enabled.
 
# Pick the resource to emit from the single CLI argument: the Certificate for
# 'output_pom_cert', otherwise the Pomerium global settings.
if phase == 'output_pom_cert':
    output = pomCert()
else:
    output = pomeriumGlobalConfig()

# Exactly one JSON document goes to stdout.
print(json.dumps(output))
tasks.py
Show inline comments
 
import sys
 
import time
 
from invoke import task
 
from invoke.exceptions import UnexpectedExit
 

	
 

	
 
def authCert(ctx):
    """Apply ``config/60-auth-cert.yaml``, retrying until kubectl succeeds.

    Makes up to 100 attempts. Each failed attempt (``UnexpectedExit`` from
    ``ctx.run``) sleeps 2 seconds and writes a progress dot to stderr. On
    success "worked" is written to stderr and the function returns.

    Raises:
        ValueError: when every attempt failed.
    """
    attempts_left = 100
    while attempts_left > 0:
        attempts_left -= 1
        try:
            ctx.run("kubectl apply -f config/60-auth-cert.yaml", echo=True)
        except UnexpectedExit:
            time.sleep(2)
            sys.stderr.write('.')
            sys.stderr.flush()
        else:
            sys.stderr.write("worked")
            return
    raise ValueError
 
            
 

	
 

	
 
@task
def run(ctx):
    """Bring the deployment up by running kubectl/skaffold commands in order.

    Every step shells out through ``ctx.run`` with ``echo=True``, so each
    command is printed as it runs. The order of the steps matters; see the
    inline notes.
    """
    # Remove any previous secrets-generation job before (re)deploying.
    ctx.run("kubectl delete -n pomerium job/pomerium-gen-secrets --ignore-not-found", echo=True)
    ctx.run("skaffold run -f use-invoke-not-skaffold.yaml", echo=True)
    # Retries applying config/60-auth-cert.yaml until kubectl accepts it.
    authCert(ctx)
    # NOTE(review): make_global.py as shown unpacks exactly one CLI argument,
    # so this no-argument call would fail there; confirm whether this line
    # is a leftover.
    ctx.run("./make_global.py | kubectl apply -f -", echo=True)
    ctx.run("kubectl kustomize upstream | kubectl apply -f -", echo=True)
    # Fixed pause before applying more resources; "CM" is presumably
    # cert-manager (TODO confirm).
    print("let CM start up")
    time.sleep(15)
    ctx.run("kubectl apply -f config/05-idp-secret.yaml", echo=True)
    ctx.run("kubectl apply -f config/dns-secret.yaml", echo=True)
    # ctx.run("kubectl apply -f config/06-postgres.yaml", echo=True)
    ctx.run("kubectl apply -f config/51-pomerium-production-issuer.yaml", echo=True)
    ctx.run("kubectl apply -f config/51-pomerium-staging-issuer.yaml", echo=True)
    ctx.run("kubectl apply -f config/dns-issuers.yaml", echo=True)
    # First pass of the global settings: any phase other than
    # 'output_pom_cert' / 'wait_for_cert' emits them without certificates.
    ctx.run("./make_global.py no_cert | kubectl apply -f -", echo=True)

    # Emit and apply the Certificate resource.
    ctx.run("./make_global.py output_pom_cert | kubectl apply -f -", echo=True)
    # that will make infinite certs :( Clean up the redundant requests before LE ratelimits!
    #   k delete -n pomerium certificaterequests.cert-manager.io <tab>

    ctx.run("kubectl apply -f ingress/default.yaml", echo=True)
    ctx.run("kubectl apply -f ingress/static.yaml", echo=True)

    # this may wait for
    # 1) nothing; cert+secret exist
    # 2) a letsencrypt session
    # 3) a cert-manager delay before a LE session (e.g. 45 minutes)
    ctx.run("./make_global.py wait_for_cert | kubectl apply -f -", echo=True)
 

	
 

	
 
@task
def delete(ctx):
    """Tear the deployment down by running kubectl/skaffold delete commands.

    Commands run in a fixed order through ``ctx.run`` with ``echo=True``:
    first the file- and name-based deletes, then the individual workloads
    and services, and finally the secrets-generation job once more.
    """
    # todo don't delete certs that have big timeouts to remake
    teardown_cmds = (
        "kubectl delete pomerium/global --ignore-not-found",
        "kubectl delete -f config/dns-issuers.yaml --ignore-not-found",
        "kubectl delete -f config/51-pomerium-staging-issuer.yaml --ignore-not-found",
        "kubectl delete -f config/51-pomerium-production-issuer.yaml --ignore-not-found",
        "kubectl delete -f config/60-auth-cert.yaml --ignore-not-found",
        "kubectl delete pomerium/global --ignore-not-found",
        "skaffold delete -f use-invoke-not-skaffold.yaml ",
        "kubectl delete -f config/06-postgres.yaml --ignore-not-found",
        "kubectl delete -f config/05-idp-secret.yaml --ignore-not-found",
    )
    for cmd in teardown_cmds:
        ctx.run(cmd, echo=True)

    # the kustomize workloads and svcs, as (kind, namespace, name)
    workloads = (
        ('job', 'pomerium', 'pomerium-gen-secrets'),
        ('deploy', 'cert-manager', 'cert-manager'),
        ('deploy', 'cert-manager', 'cert-manager-cainjector'),
        ('deploy', 'cert-manager', 'cert-manager-webhook'),
        ('deploy', 'pomerium', 'pomerium'),
        ('service', 'cert-manager', 'cert-manager'),
        ('service', 'cert-manager', 'cert-manager-webhook'),
        ('service', 'pomerium', 'pomerium-metrics'),
        ('service', 'pomerium', 'pomerium-proxy'),
    )
    for kind, ns, name in workloads:
        ctx.run(f"kubectl delete -n {ns} {kind} {name} --ignore-not-found", echo=True)

    ctx.run("kubectl delete -n pomerium job/pomerium-gen-secrets --ignore-not-found", echo=True)
 

	
 

	
 
'''
 
troubleshooting, based on 
 
https://cert-manager.io/docs/troubleshooting/
 
then
 
https://cert-manager.io/docs/concepts/acme-orders-challenges/
 

	
 
I had these open:
 
✨ dash(pts/31):~% watch 'kubectl describe -n pomerium issuers.cert-manager.io letsencrypt-staging'
 
✨ dash(pts/31):~% watch 'kubectl describe -n pomerium issuers.cert-manager.io letsencrypt-prod'
 
✨ dash(pts/29):~% watch "kubectl get -n pomerium certificates.cert-manager.io -o wide"
 
✨ dash(pts/36):~% watch 'kubectl describe -n pomerium certificaterequests.cert-manager.io'
 
✨ dash(pts/37):~% watch 'kubectl describe -n pomerium orders.acme.cert-manager.io'
 
✨ dash(pts/38):~% watch 'kubectl describe -n pomerium challenges.acme.cert-manager.io '
 

	
 
Then I checked clusterissuer vs issuer and the ns of the 60-auth-cert.yaml resources,
and I often restarted cert-manager and eventually pomerium too. The last line of
10-pom-pom.yaml may need to be toggled.
 

	
 
The 'cm-acme-http-solver' ingress for LE comes and goes, but I didn't have to force it to exist.
 

	
 
Didn't need 04-gen-secrets-job.yaml
 

	
 
Also, CM says this a lot, which may mean it is afraid to renew bigasterisk.com:
 

	
 
    I1213 07:00:01.946799       1 sync.go:394] cert-manager/controller/ingress-shim "msg"="certificate resource is not owned by this object. refusing to update non-owned certificate resource for object" "related_resource_kind"="Certificate" "related_resource_name"="bigasterisk.com-tls" "related_resource_namespace"="default" "related_resource_version"="v1" "resource_kind"="Ingress" "resource_name"="registry" "resource_namespace"="default" "resource_version"="v1"
 

	
 
'''
use-invoke-not-skaffold.yaml
Show inline comments
 
deleted file
0 comments (0 inline, 0 general)