
Add graceful upgrade process

Based on #718 introduced by rsmitty.

Includes all roles and all options to support deployment of
new hosts in case they were added to the inventory.

The main difference here is that the master role is evaluated first
so that master components get upgraded first.

Fixes #694
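
For orientation, a condensed sketch of the play ordering this introduces, distilled from the upgrade-cluster.yml diff below (the bootstrap, etcd, vault, calico-rr and final apps plays are omitted here for brevity):

```
# Condensed from upgrade-cluster.yml below: masters first, one at a time,
# then workers in batches; each node is cordoned and drained before the
# upgrade and uncordoned afterwards.
- hosts: kube-master
  serial: 1
  roles:
    - { role: upgrade/pre-upgrade, tags: pre-upgrade }    # cordon + drain
    - { role: kubernetes/node, tags: node }
    - { role: kubernetes/master, tags: master }
    - { role: network_plugin, tags: network }
    - { role: upgrade/post-upgrade, tags: post-upgrade }  # uncordon

- hosts: kube-node:!kube-master
  serial: "{{ serial | default('20%') }}"  # worker batch size, 20% by default
  roles:
    - { role: upgrade/pre-upgrade, tags: pre-upgrade }
    - { role: kubernetes/node, tags: node }
    - { role: network_plugin, tags: network }
    - { role: upgrade/post-upgrade, tags: post-upgrade }
```

The full playbook in the diff below also re-runs the etcd, vault, calico-rr and kubernetes-apps plays around these two.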
Branch: pull/1029/head
Matthew Mosesohn · 7 years ago · commit 97ebbb9672
5 changed files with 96 additions and 22 deletions
  1. .gitlab-ci.yml (15)
  2. docs/upgrades.md (16)
  3. roles/upgrade/post-upgrade/tasks/main.yml (2)
  4. roles/upgrade/pre-upgrade/tasks/main.yml (4)
  5. upgrade-cluster.yml (81)

.gitlab-ci.yml (15)

@@ -101,8 +101,8 @@ before_script:
# Check out latest tag if testing upgrade
# Uncomment when gitlab kargo repo has tags
-#- test "${UPGRADE_TEST}" = "true" && git fetch --all && git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
-- test "${UPGRADE_TEST}" = "true" && git checkout 031cf565ec3ccd3ebbe80eeef3454c3780e5c598 && pip install ansible==2.2.0
+#- test "${UPGRADE_TEST}" != "false" && git fetch --all && git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
+- test "${UPGRADE_TEST}" != "false" && git checkout 031cf565ec3ccd3ebbe80eeef3454c3780e5c598 && pip install ansible==2.2.0
# Create cluster
@@ -127,9 +127,10 @@ before_script:
cluster.yml
# Repeat deployment if testing upgrade
#FIXME(mattymo): repeat "Create cluster" above without duplicating code
- >
-if [ "${UPGRADE_TEST}" = "true" ]; then
+if [ "${UPGRADE_TEST}" != "false" ]; then
+test "${UPGRADE_TEST}" == "basic" && PLAYBOOK="cluster.yml";
+test "${UPGRADE_TEST}" == "graceful" && PLAYBOOK="upgrade-cluster.yml";
pip install ansible==2.2.1.0;
git checkout "${CI_BUILD_REF}";
ansible-playbook -i inventory/inventory.ini -b --become-user=root --private-key=${HOME}/.ssh/id_rsa -u $SSH_USER
@@ -149,7 +150,7 @@ before_script:
-e resolvconf_mode=${RESOLVCONF_MODE}
-e weave_cpu_requests=${WEAVE_CPU_LIMIT}
-e weave_cpu_limit=${WEAVE_CPU_LIMIT}
-cluster.yml;
+$PLAYBOOK;
fi
# Tests Cases
@@ -253,7 +254,7 @@ before_script:
KUBE_NETWORK_PLUGIN: canal
CLOUD_IMAGE: debian-8-kubespray
CLOUD_REGION: us-east1-b
-UPGRADE_TEST: "true"
+UPGRADE_TEST: "basic"
CLUSTER_MODE: ha
.rhel7_weave_variables: &rhel7_weave_variables
@@ -261,7 +262,7 @@ before_script:
KUBE_NETWORK_PLUGIN: weave
CLOUD_IMAGE: rhel-7
CLOUD_REGION: europe-west1-b
-UPGRADE_TEST: "true"
+UPGRADE_TEST: "graceful"
CLUSTER_MODE: default
.centos7_flannel_variables: &centos7_flannel_variables

docs/upgrades.md (16)

@@ -18,7 +18,7 @@ versions. Here are all version vars for each component:
* flannel_version
* kubedns_version
-#### Example
+#### Unsafe upgrade example
If you wanted to upgrade just kube_version from v1.4.3 to v1.4.6, you could
deploy the following way:
@@ -33,6 +33,20 @@ And then repeat with v1.4.6 as kube_version:
ansible-playbook cluster.yml -i inventory/inventory.cfg -e kube_version=v1.4.6
```
+#### Graceful upgrade
+Kargo also supports cordoning, draining and uncordoning of nodes when performing
+a cluster upgrade. There is a separate playbook used for this purpose. It is
+important to note that upgrade-cluster.yml can only be used for upgrading an
+existing cluster. That means there must be at least 1 kube-master already
+deployed.
+```
+git fetch origin
+git checkout origin/master
+ansible-playbook upgrade-cluster.yml -i inventory/inventory.cfg
+```
#### Upgrade order
As mentioned above, components are upgraded in the order in which they were
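
One detail worth noting alongside the graceful upgrade section above: the worker play in upgrade-cluster.yml templates its serial value, so the batch size should be adjustable per run. The -e serial=1 override shown in the comment is an assumption based on normal Ansible extra-var precedence, not something stated in this commit.

```
# Excerpt of the worker play from the upgrade-cluster.yml diff further below.
# 20% of the kube-node group is drained and upgraded at a time unless a
# "serial" variable is supplied, e.g.:
#   ansible-playbook upgrade-cluster.yml -i inventory/inventory.cfg -e serial=1
- hosts: kube-node:!kube-master
  any_errors_fatal: true
  serial: "{{ serial | default('20%') }}"
```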

roles/upgrade/post-upgrade/tasks/main.yml (2)

@@ -1,5 +1,5 @@
---
- name: Uncordon node
-command: kubectl uncordon {{ ansible_hostname }}
+command: "{{ bin_dir }}/kubectl uncordon {{ ansible_hostname }}"
delegate_to: "{{ groups['kube-master'][0] }}"

roles/upgrade/pre-upgrade/tasks/main.yml (4)

@@ -1,11 +1,11 @@
---
- name: Cordon node
-command: kubectl cordon {{ ansible_hostname }}
+command: "{{ bin_dir }}/kubectl cordon {{ ansible_hostname }}"
delegate_to: "{{ groups['kube-master'][0] }}"
- name: Drain node
-command: kubectl drain --force --ignore-daemonsets --grace-period 30 --delete-local-data {{ ansible_hostname }}
+command: "{{ bin_dir }}/kubectl drain --force --ignore-daemonsets --grace-period 30 --delete-local-data {{ ansible_hostname }}"
delegate_to: "{{ groups['kube-master'][0] }}"
- name: Sleep for grace period for draining

upgrade-cluster.yml (81)

@@ -1,33 +1,92 @@
---
- hosts: all
- hosts: localhost
gather_facts: False
roles:
- bastion-ssh-config
tags: [localhost, bastion]
- hosts: k8s-cluster:etcd:calico-rr
any_errors_fatal: true
gather_facts: false
vars:
# Need to disable pipelining for bootstrap-os as some systems have requiretty in sudoers set, which makes pipelining
# fail. bootstrap-os fixes this on these systems, so in later plays it can be enabled.
ansible_ssh_pipelining: false
roles:
- bootstrap-os
tags:
- bootstrap-os
- hosts: k8s-cluster:etcd:calico-rr
any_errors_fatal: true
vars:
ansible_ssh_pipelining: true
gather_facts: true
- hosts: all:!network-storage
- hosts: k8s-cluster:etcd:calico-rr
any_errors_fatal: true
roles:
- { role: kernel-upgrade, tags: kernel-upgrade, when: kernel_upgrade is defined and kernel_upgrade }
- { role: kubernetes/preinstall, tags: preinstall }
- { role: docker, tags: docker }
- role: rkt
tags: rkt
when: "'rkt' in [etcd_deployment_type, kubelet_deployment_type, vault_deployment_type]"
- hosts: etcd:k8s-cluster:vault
any_errors_fatal: true
roles:
- { role: vault, tags: vault, vault_bootstrap: true, when: "cert_management == 'vault'" }
- hosts: etcd:!k8s-cluster
any_errors_fatal: true
serial: 1
roles:
- { role: etcd, tags: etcd }
- hosts: kube-node
- hosts: k8s-cluster
any_errors_fatal: true
serial: 1
roles:
- { role: etcd, tags: etcd }
- { role: upgrade/pre-upgrade, tags: upgrade/pre-upgrade }
- { role: kubernetes/node, tags: node }
- { role: network_plugin, tags: network }
- { role: upgrade/post-upgrade, tags: upgrade/post-upgrade }
- hosts: etcd:k8s-cluster:vault
any_errors_fatal: true
roles:
- { role: vault, tags: vault, when: "cert_management == 'vault'"}
#Handle upgrades to master components first to maintain backwards compat.
- hosts: kube-master
any_errors_fatal: true
serial: 1
roles:
- { role: etcd, tags: etcd }
- { role: upgrade/pre-upgrade, tags: pre-upgrade }
- { role: kubernetes/node, tags: node }
- { role: kubernetes/master, tags: master }
- { role: kubernetes/master, tags: master }
- { role: network_plugin, tags: network }
- { role: upgrade/post-upgrade, tags: post-upgrade }
#Finally handle worker upgrades, based on given batch size
- hosts: kube-node:!kube-master
any_errors_fatal: true
serial: "{{ serial | default('20%') }}"
roles:
- { role: upgrade/pre-upgrade, tags: pre-upgrade }
- { role: kubernetes/node, tags: node }
- { role: network_plugin, tags: network }
- { role: upgrade/post-upgrade, tags: post-upgrade }
- { role: kubernetes-apps/network_plugin, tags: network }
- hosts: calico-rr
any_errors_fatal: true
roles:
- { role: network_plugin/calico/rr, tags: network }
- hosts: k8s-cluster
any_errors_fatal: true
roles:
- { role: dnsmasq, when: "dns_mode == 'dnsmasq_kubedns'", tags: dnsmasq }
- { role: kubernetes/preinstall, when: "dns_mode != 'none' and resolvconf_mode == 'host_resolvconf'", tags: resolvconf }
- hosts: kube-master[0]
any_errors_fatal: true
roles:
- { role: kubernetes-apps, tags: apps }