
nodelocaldns: allow a secondary pod for nodelocaldns for local-HA (#8100)

* nodelocaldns: allow a secondary pod for nodelocaldns for local-HA

* CI: add job to test nodelocaldns secondary
Cristian Calin, 3 years ago (committed by GitHub)
commit 039205560a
12 changed files with 281 additions and 17 deletions
  1. .gitlab-ci/packet.yml (5 changes)
  2. docs/dns-stack.md (16 changes)
  3. inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml (3 changes)
  4. roles/download/defaults/main.yml (2 changes)
  5. roles/kubernetes-apps/ansible/defaults/main.yml (2 changes)
  6. roles/kubernetes-apps/ansible/tasks/main.yml (1 change)
  7. roles/kubernetes-apps/ansible/tasks/nodelocaldns.yml (28 changes)
  8. roles/kubernetes-apps/ansible/templates/nodelocaldns-config.yml.j2 (88 changes)
  9. roles/kubernetes-apps/ansible/templates/nodelocaldns-daemonset.yml.j2 (32 changes)
  10. roles/kubernetes-apps/ansible/templates/nodelocaldns-second-daemonset.yml.j2 (103 changes)
  11. roles/kubespray-defaults/defaults/main.yaml (3 changes)
  12. tests/files/packet_centos8-calico-nodelocaldns-secondary.yml (15 changes)

.gitlab-ci/packet.yml (5 changes)

@@ -194,6 +194,11 @@ packet_amazon-linux-2-aio:
   extends: .packet_pr
   when: manual
 
+packet_centos8-calico-nodelocaldns-secondary:
+  stage: deploy-part2
+  extends: .packet_pr
+  when: manual
+
 packet_fedora34-kube-ovn-containerd:
   stage: deploy-part2
   extends: .packet_periodic

docs/dns-stack.md (16 changes)

@@ -212,6 +212,22 @@ nodelocaldns_external_zones:
 
 See [dns_etchosts](#dns_etchosts-coredns) above.
 
+### Nodelocal DNS HA
+
+Under some circumstances the single-pod nodelocaldns implementation may not be replaced quickly enough, so a cluster upgrade or a nodelocaldns upgrade can cause DNS requests to time out for short intervals. If for any reason your applications cannot tolerate this behavior, you can enable a redundant nodelocal DNS pod on each node:
+
+```yaml
+enable_nodelocaldns_secondary: true
+```
+
+**Note:** when the nodelocaldns secondary is enabled, the primary is instructed to no longer tear down the iptables rules it sets up to direct traffic to itself. If both daemonsets have failing pods on the same node, this can cause a DNS blackout, with traffic no longer falling back to the central coredns service. Please account for this as well if you decide to disable the nodelocaldns cache.
+
+There is a configurable time delta (in seconds) during which the secondary nodelocaldns keeps serving, in case both the primary and secondary daemonsets are updated at the same time. It is advised to tune this variable after you have performed some tests in your own environment:
+
+```yaml
+nodelocaldns_secondary_skew_seconds: 5
+```
+
 ## Limitations
 
 * Kubespray does not yet have a way to configure the Kubedns addon to forward requests that SkyDns can handle.
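
The skew value above is consumed by the secondary daemonset added later in this commit: a preStop hook sleeps for the skew before killing the process, and the pod's termination grace period is set to the same value, so the secondary keeps answering on :53 while the primary is being replaced. A condensed sketch of the relevant template fields:

```yaml
lifecycle:
  preStop:
    exec:
      command: ["sh", "-c", "sleep {{ nodelocaldns_secondary_skew_seconds }} && kill -9 1"]
terminationGracePeriodSeconds: {{ nodelocaldns_secondary_skew_seconds }}
```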

inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml (3 changes)

@@ -166,9 +166,12 @@ dns_mode: coredns
 # manual_dns_server: 10.x.x.x
 # Enable nodelocal dns cache
 enable_nodelocaldns: true
+enable_nodelocaldns_secondary: false
 nodelocaldns_ip: 169.254.25.10
 nodelocaldns_health_port: 9254
+nodelocaldns_second_health_port: 9256
 nodelocaldns_bind_metrics_host_ip: false
+nodelocaldns_secondary_skew_seconds: 5
 # nodelocaldns_external_zones:
 # - zones:
 #   - example.com

roles/download/defaults/main.yml (2 changes)

@@ -610,7 +610,7 @@ coredns_image_is_namespaced: "{{ (kube_version is version('v1.21.0','>=')) or (c
 coredns_image_repo: "{{ kube_image_repo }}{{'/coredns/coredns' if (coredns_image_is_namespaced | bool) else '/coredns' }}"
 coredns_image_tag: "{{ coredns_version if (coredns_image_is_namespaced | bool) else (coredns_version | regex_replace('^v', '')) }}"
 
-nodelocaldns_version: "1.17.1"
+nodelocaldns_version: "1.21.1"
 nodelocaldns_image_repo: "{{ kube_image_repo }}/dns/k8s-dns-node-cache"
 nodelocaldns_image_tag: "{{ nodelocaldns_version }}"

roles/kubernetes-apps/ansible/defaults/main.yml (2 changes)

@@ -17,6 +17,8 @@ nodelocaldns_cpu_requests: 100m
 nodelocaldns_memory_limit: 170Mi
 nodelocaldns_memory_requests: 70Mi
 nodelocaldns_ds_nodeselector: "kubernetes.io/os: linux"
+nodelocaldns_prometheus_port: 9253
+nodelocaldns_secondary_prometheus_port: 9255
 
 # Limits for dns-autoscaler
 dns_autoscaler_cpu_requests: 20m
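
With both daemonsets running on the host network, each pod must expose its metrics on a distinct node port, which is why a second prometheus port default is introduced here. A hypothetical smoke test (not part of this commit) run against a node:

```yaml
# Check that the primary and secondary nodelocaldns expose separate metrics ports.
- name: Probe nodelocaldns metrics endpoints
  uri:
    url: "http://localhost:{{ item }}/metrics"
  loop:
    - 9253  # nodelocaldns_prometheus_port (primary)
    - 9255  # nodelocaldns_secondary_prometheus_port
```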

roles/kubernetes-apps/ansible/tasks/main.yml (1 change)

@@ -48,6 +48,7 @@
     - "{{ coredns_manifests.results | default({}) }}"
     - "{{ coredns_secondary_manifests.results | default({}) }}"
     - "{{ nodelocaldns_manifests.results | default({}) }}"
+    - "{{ nodelocaldns_second_manifests.results | default({}) }}"
   when:
     - dns_mode != 'none'
     - inventory_hostname == groups['kube_control_plane'][0]

roles/kubernetes-apps/ansible/tasks/nodelocaldns.yml (28 changes)

@@ -43,3 +43,31 @@
   tags:
     - nodelocaldns
     - coredns
+
+- name: Kubernetes Apps | Lay Down nodelocaldns-secondary Template
+  template:
+    src: "{{ item.file }}.j2"
+    dest: "{{ kube_config_dir }}/{{ item.file }}"
+  with_items:
+    - { name: nodelocaldns, file: nodelocaldns-second-daemonset.yml, type: daemonset }
+  register: nodelocaldns_second_manifests
+  vars:
+    forwardTarget: >-
+      {%- if secondaryclusterIP is defined and dns_mode == 'coredns_dual' -%}
+      {{ primaryClusterIP }} {{ secondaryclusterIP }}
+      {%- else -%}
+      {{ primaryClusterIP }}
+      {%- endif -%}
+    upstreamForwardTarget: >-
+      {%- if resolvconf_mode == 'host_resolvconf' and upstream_dns_servers is defined and upstream_dns_servers|length > 0 -%}
+      {{ upstream_dns_servers|join(' ') }}
+      {%- else -%}
+      /etc/resolv.conf
+      {%- endif -%}
+  when:
+    - enable_nodelocaldns
+    - enable_nodelocaldns_secondary
+    - inventory_hostname == groups['kube_control_plane'] | first
+  tags:
+    - nodelocaldns
+    - coredns
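
For reference, these vars render to plain forwarder targets in the Corefile template. An illustrative rendering, assuming dns_mode: coredns_dual and example cluster IPs (10.233.0.3 / 10.233.0.4 are placeholders, not guaranteed defaults):

```yaml
forwardTarget: "10.233.0.3 10.233.0.4"
# Without host_resolvconf plus upstream_dns_servers, unknown zones fall back
# to the node's own resolver configuration:
upstreamForwardTarget: "/etc/resolv.conf"
```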

roles/kubernetes-apps/ansible/templates/nodelocaldns-config.yml.j2 (88 changes)

@@ -17,7 +17,7 @@ data:
         loop
         bind {{ nodelocaldns_ip }}
         forward . {{ block['nameservers'] | join(' ') }}
-        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:9253
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_prometheus_port }}
         log
 {% if dns_etchosts | default(None) %}
         hosts /etc/coredns/hosts {
@@ -39,7 +39,7 @@ data:
         forward . {{ forwardTarget }} {
           force_tcp
         }
-        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:9253
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_prometheus_port }}
         health {{ nodelocaldns_ip }}:{{ nodelocaldns_health_port }}
 {% if dns_etchosts | default(None) %}
         hosts /etc/coredns/hosts {
@@ -56,7 +56,7 @@ data:
         forward . {{ forwardTarget }} {
           force_tcp
         }
-        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:9253
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_prometheus_port }}
     }
     ip6.arpa:53 {
         errors
@@ -67,7 +67,7 @@ data:
         forward . {{ forwardTarget }} {
           force_tcp
         }
-        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:9253
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_prometheus_port }}
     }
     .:53 {
         errors
@@ -76,13 +76,91 @@ data:
         loop
         bind {{ nodelocaldns_ip }}
         forward . {{ upstreamForwardTarget }}
-        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:9253
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_prometheus_port }}
 {% if dns_etchosts | default(None) %}
         hosts /etc/coredns/hosts {
           fallthrough
         }
 {% endif %}
     }
+{% if enable_nodelocaldns_secondary %}
+  Corefile-second: |
+{% if nodelocaldns_external_zones is defined and nodelocaldns_external_zones|length > 0 %}
+{% for block in nodelocaldns_external_zones %}
+    {{ block['zones'] | join(' ') }} {
+        errors
+        cache {{ block['cache'] | default(30) }}
+        reload
+        loop
+        bind {{ nodelocaldns_ip }}
+        forward . {{ block['nameservers'] | join(' ') }}
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_secondary_prometheus_port }}
+        log
+{% if dns_etchosts | default(None) %}
+        hosts /etc/coredns/hosts {
+          fallthrough
+        }
+{% endif %}
+    }
+{% endfor %}
+{% endif %}
+    {{ dns_domain }}:53 {
+        errors
+        cache {
+          success 9984 30
+          denial 9984 5
+        }
+        reload
+        loop
+        bind {{ nodelocaldns_ip }}
+        forward . {{ forwardTarget }} {
+          force_tcp
+        }
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_secondary_prometheus_port }}
+        health {{ nodelocaldns_ip }}:{{ nodelocaldns_second_health_port }}
+{% if dns_etchosts | default(None) %}
+        hosts /etc/coredns/hosts {
+          fallthrough
+        }
+{% endif %}
+    }
+    in-addr.arpa:53 {
+        errors
+        cache 30
+        reload
+        loop
+        bind {{ nodelocaldns_ip }}
+        forward . {{ forwardTarget }} {
+          force_tcp
+        }
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_secondary_prometheus_port }}
+    }
+    ip6.arpa:53 {
+        errors
+        cache 30
+        reload
+        loop
+        bind {{ nodelocaldns_ip }}
+        forward . {{ forwardTarget }} {
+          force_tcp
+        }
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_secondary_prometheus_port }}
+    }
+    .:53 {
+        errors
+        cache 30
+        reload
+        loop
+        bind {{ nodelocaldns_ip }}
+        forward . {{ upstreamForwardTarget }}
+        prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_secondary_prometheus_port }}
+{% if dns_etchosts | default(None) %}
+        hosts /etc/coredns/hosts {
+          fallthrough
+        }
+{% endif %}
+    }
+{% endif %}
 {% if dns_etchosts | default(None) %}
   hosts: |
     {{ dns_etchosts | indent(width=4, indentfirst=None) }}
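
To make the template concrete, this is roughly what the final stanza of the rendered Corefile-second looks like, a sketch assuming the defaults from this commit (nodelocaldns_ip 169.254.25.10, secondary metrics port 9255, nodelocaldns_bind_metrics_host_ip: false):

```yaml
data:
  Corefile-second: |
    .:53 {
        errors
        cache 30
        reload
        loop
        bind 169.254.25.10
        forward . /etc/resolv.conf
        prometheus :9255
    }
```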

roles/kubernetes-apps/ansible/templates/nodelocaldns-daemonset.yml.j2 (32 changes)

@@ -16,7 +16,7 @@ spec:
         k8s-app: nodelocaldns
       annotations:
         prometheus.io/scrape: 'true'
-        prometheus.io/port: '9253'
+        prometheus.io/port: '{{ nodelocaldns_prometheus_port }}'
     spec:
       nodeSelector:
         {{ nodelocaldns_ds_nodeselector }}
@@ -38,16 +38,16 @@ spec:
           requests:
             cpu: {{ nodelocaldns_cpu_requests }}
             memory: {{ nodelocaldns_memory_requests }}
-        args: [ "-localip", "{{ nodelocaldns_ip }}", "-conf", "/etc/coredns/Corefile", "-upstreamsvc", "coredns" ]
-        securityContext:
-          privileged: true
-{% if nodelocaldns_bind_metrics_host_ip %}
-        env:
-          - name: MY_HOST_IP
-            valueFrom:
-              fieldRef:
-                fieldPath: status.hostIP
-{% endif %}
+        args:
+          - -localip
+          - {{ nodelocaldns_ip }}
+          - -conf
+          - /etc/coredns/Corefile
+          - -upstreamsvc
+          - coredns
+{% if enable_nodelocaldns_secondary %}
+          - -skipteardown
+{% else %}
         ports:
           - containerPort: 53
             name: dns
@@ -58,6 +58,16 @@ spec:
           - containerPort: 9253
             name: metrics
             protocol: TCP
+{% endif %}
+        securityContext:
+          privileged: true
+{% if nodelocaldns_bind_metrics_host_ip %}
+        env:
+          - name: MY_HOST_IP
+            valueFrom:
+              fieldRef:
+                fieldPath: status.hostIP
+{% endif %}
         livenessProbe:
           httpGet:
             host: {{ nodelocaldns_ip }}
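
Rendered with enable_nodelocaldns_secondary: true and the default nodelocaldns_ip, the primary's args come out as below; the container ports block is skipped in that case, presumably because two hostNetwork pods cannot both declare the same ports on a node (illustrative, defaults assumed):

```yaml
args:
  - -localip
  - 169.254.25.10
  - -conf
  - /etc/coredns/Corefile
  - -upstreamsvc
  - coredns
  - -skipteardown
```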

roles/kubernetes-apps/ansible/templates/nodelocaldns-second-daemonset.yml.j2 (103 changes)

@@ -0,0 +1,103 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nodelocaldns-second
  namespace: kube-system
  labels:
    k8s-app: kube-dns
    addonmanager.kubernetes.io/mode: Reconcile
spec:
  selector:
    matchLabels:
      k8s-app: nodelocaldns-second
  template:
    metadata:
      labels:
        k8s-app: nodelocaldns-second
      annotations:
        prometheus.io/scrape: 'true'
        prometheus.io/port: '{{ nodelocaldns_secondary_prometheus_port }}'
    spec:
      nodeSelector:
        {{ nodelocaldns_ds_nodeselector }}
      priorityClassName: system-cluster-critical
      serviceAccountName: nodelocaldns
      hostNetwork: true
      dnsPolicy: Default  # Don't use cluster DNS.
      tolerations:
        - effect: NoSchedule
          operator: "Exists"
        - effect: NoExecute
          operator: "Exists"
      containers:
        - name: node-cache
          image: "{{ nodelocaldns_image_repo }}:{{ nodelocaldns_image_tag }}"
          resources:
            limits:
              memory: {{ nodelocaldns_memory_limit }}
            requests:
              cpu: {{ nodelocaldns_cpu_requests }}
              memory: {{ nodelocaldns_memory_requests }}
          args: [ "-localip", "{{ nodelocaldns_ip }}", "-conf", "/etc/coredns/Corefile", "-upstreamsvc", "coredns", "-skipteardown" ]
          securityContext:
            privileged: true
{% if nodelocaldns_bind_metrics_host_ip %}
          env:
            - name: MY_HOST_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.hostIP
{% endif %}
          livenessProbe:
            httpGet:
              host: {{ nodelocaldns_ip }}
              path: /health
              port: {{ nodelocaldns_second_health_port }}
              scheme: HTTP
            timeoutSeconds: 5
            successThreshold: 1
            failureThreshold: 10
          readinessProbe:
            httpGet:
              host: {{ nodelocaldns_ip }}
              path: /health
              port: {{ nodelocaldns_second_health_port }}
              scheme: HTTP
            timeoutSeconds: 5
            successThreshold: 1
            failureThreshold: 10
          volumeMounts:
            - name: config-volume
              mountPath: /etc/coredns
            - name: xtables-lock
              mountPath: /run/xtables.lock
          lifecycle:
            preStop:
              exec:
                command:
                  - sh
                  - -c
                  - sleep {{ nodelocaldns_secondary_skew_seconds }} && kill -9 1
      volumes:
        - name: config-volume
          configMap:
            name: nodelocaldns
            items:
              - key: Corefile-second
                path: Corefile
{% if dns_etchosts | default(None) %}
              - key: hosts
                path: hosts
{% endif %}
        - name: xtables-lock
          hostPath:
            path: /run/xtables.lock
            type: FileOrCreate
      # Implement a time skew between the main nodelocaldns and this secondary.
      # Since the two nodelocaldns instances share the :53 port, we want to keep
      # at least one running at all times, even if the manifests are replaced simultaneously.
      terminationGracePeriodSeconds: {{ nodelocaldns_secondary_skew_seconds }}
  updateStrategy:
    rollingUpdate:
      maxUnavailable: {{ serial | default('20%') }}
    type: RollingUpdate
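
A quick way to confirm the HA pair is fully rolled out on every node before an upgrade; this is a hypothetical check, not part of this commit, though the daemonset names match the templates above:

```yaml
- name: Verify both nodelocaldns daemonsets are fully ready
  command: >-
    kubectl -n kube-system get daemonset {{ item }}
    -o jsonpath='{.status.numberReady}/{.status.desiredNumberScheduled}'
  loop:
    - nodelocaldns
    - nodelocaldns-second
  register: nodelocaldns_rollout
  changed_when: false
```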

roles/kubespray-defaults/defaults/main.yaml (3 changes)

@@ -93,9 +93,12 @@ dns_mode: coredns
 
 # Enable nodelocal dns cache
 enable_nodelocaldns: true
+enable_nodelocaldns_secondary: false
 nodelocaldns_ip: 169.254.25.10
 nodelocaldns_health_port: 9254
+nodelocaldns_second_health_port: 9256
 nodelocaldns_bind_metrics_host_ip: false
+nodelocaldns_secondary_skew_seconds: 5
 
 # Should be set to a cluster IP if using a custom cluster DNS
 manual_dns_server: ""

tests/files/packet_centos8-calico-nodelocaldns-secondary.yml (15 changes)

@@ -0,0 +1,15 @@
---
# Instance settings
cloud_image: centos-8
mode: default
vm_memory: 3072Mi

# Kubespray settings
kube_network_plugin: calico
deploy_netchecker: true
dns_min_replicas: 1
enable_nodelocaldns_secondary: true
loadbalancer_apiserver_type: haproxy

# required
calico_iptables_backend: "Auto"
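
To reproduce this test setup on your own cluster, the same flags go into the inventory group vars (the path and values follow the sample inventory shown earlier in this commit):

```yaml
# inventory/<your-cluster>/group_vars/k8s_cluster/k8s-cluster.yml
enable_nodelocaldns: true
enable_nodelocaldns_secondary: true
nodelocaldns_secondary_skew_seconds: 5
```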