
Upgrade to kubeadm (#1667)

* Enable upgrade to kubeadm

* fix kubedns upgrade

* try upgrade route

* use init/upgrade strategy for kubeadm and ignore kubedns svc

* Use bin_dir for kubeadm

* delete more secrets

* fix waiting for terminating pods

* Manually enforce kube-proxy for kubeadm deploy

* remove proxy. update to kubeadm 1.8.0rc1
Matthew Mosesohn, 7 years ago (committed by GitHub)
commit bd272e0b3c
17 changed files with 210 additions and 42 deletions
  1. .gitlab-ci.yml (35)
  2. cluster.yml (1)
  3. docs/upgrades.md (14)
  4. roles/download/defaults/main.yml (6)
  5. roles/kubernetes-apps/ansible/tasks/main.yml (18)
  6. roles/kubernetes-apps/network_plugin/weave/tasks/main.yml (11)
  7. roles/kubernetes-apps/rotate_tokens/tasks/main.yml (20)
  8. roles/kubernetes/kubeadm/tasks/main.yml (2)
  9. roles/kubernetes/master/tasks/kubeadm-cleanup-old-certs.yml (3)
  10. roles/kubernetes/master/tasks/kubeadm-migrate-certs.yml (12)
  11. roles/kubernetes/master/tasks/kubeadm-setup.yml (93)
  12. roles/kubernetes/master/templates/kubeadm-config.yaml.j2 (3)
  13. roles/kubernetes/node/tasks/install.yml (10)
  14. roles/kubernetes/node/tasks/main.yml (7)
  15. roles/reset/tasks/main.yml (1)
  16. tests/testcases/030_check-network.yml (14)
  17. upgrade-cluster.yml (2)

.gitlab-ci.yml (35)

@@ -299,12 +299,24 @@ before_script:
UPGRADE_TEST: "graceful"
STARTUP_SCRIPT: ""
.centos_weave_kubeadm_variables: &centos_weave_kubeadm_variables
# stage: deploy-gce-part1
KUBE_NETWORK_PLUGIN: weave
AUTHORIZATION_MODES: "{ 'authorization_modes': [ 'RBAC' ] }"
CLOUD_IMAGE: centos-7
CLOUD_MACHINE_TYPE: "n1-standard-1"
CLOUD_REGION: us-central1-b
CLUSTER_MODE: ha
KUBEADM_ENABLED: "true"
UPGRADE_TEST: "graceful"
STARTUP_SCRIPT: ""
.ubuntu_canal_kubeadm_variables: &ubuntu_canal_kubeadm_variables
# stage: deploy-gce-part1
KUBE_NETWORK_PLUGIN: canal
AUTHORIZATION_MODES: "{ 'authorization_modes': [ 'RBAC' ] }"
CLOUD_IMAGE: ubuntu-1604-xenial
CLOUD_MACHINE_TYPE: "n1-standard-2"
CLOUD_MACHINE_TYPE: "n1-standard-1"
CLOUD_REGION: europe-west1-b
CLUSTER_MODE: ha
KUBEADM_ENABLED: "true"
@@ -521,6 +533,27 @@ ubuntu-canal-kubeadm-triggers:
when: on_success
only: ['triggers']
centos-weave-kubeadm-rbac:
stage: deploy-gce-part1
<<: *job
<<: *gce
variables:
<<: *gce_variables
<<: *centos_weave_kubeadm_variables
when: manual
except: ['triggers']
only: ['master', /^pr-.*$/]
centos-weave-kubeadm-triggers:
stage: deploy-gce-part1
<<: *job
<<: *gce
variables:
<<: *gce_variables
<<: *centos_weave_kubeadm_variables
when: on_success
only: ['triggers']
rhel7-weave:
stage: deploy-gce-part1
<<: *job

cluster.yml (1)

@@ -80,6 +80,7 @@
any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
roles:
- { role: kubespray-defaults}
- { role: kubernetes-apps/rotate_tokens, tags: rotate_tokens, when: "secret_changed|default(false)" }
- { role: kubernetes-apps/network_plugin, tags: network }
- { role: kubernetes-apps/policy_controller, tags: policy-controller }
- { role: kubernetes/client, tags: client }

docs/upgrades.md (14)

@@ -67,3 +67,17 @@ follows:
* network_plugin (such as Calico or Weave)
* kube-apiserver, kube-scheduler, and kube-controller-manager
* Add-ons (such as KubeDNS)
#### Upgrade considerations
Kubespray supports rotating the certificates used for etcd and Kubernetes
components, but some manual steps may be required. If you have a pod that
requires a service account token and is deployed in a namespace other than
`kube-system`, you will need to delete the affected pods manually after
rotating certificates. This is because all service account tokens depend on
the apiserver token that is used to generate them, so when the certificate
rotates, every service account token must be rotated as well. During the
`kubernetes-apps/rotate_tokens` role, only pods in `kube-system` are destroyed
and recreated. All other invalidated service account tokens are cleaned up
automatically, but the pods that use them are not deleted, out of an abundance
of caution about the impact on user-deployed pods.
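As a rough illustration (not part of this change), pods in such a namespace can be forced to pick up fresh tokens by deleting them so their controllers recreate them; the namespace and label below are hypothetical:

    # hypothetical namespace; adjust to wherever your token-consuming pods run
    kubectl delete pods -n my-app --all
    # or target only the affected workload
    kubectl delete pods -n my-app -l app=my-app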

roles/download/defaults/main.yml (6)

@@ -20,7 +20,7 @@ download_always_pull: False
# Versions
kube_version: v1.7.5
# Change to kube_version after v1.8.0 release
kubeadm_version: "v1.8.0-beta.1"
kubeadm_version: "v1.8.0-rc.1"
etcd_version: v3.2.4
# TODO(mattymo): Move calico versions to roles/network_plugins/calico/defaults
# after migration to container download
@@ -37,7 +37,7 @@ pod_infra_version: 3.0
kubeadm_download_url: "https://storage.googleapis.com/kubernetes-release/release/{{ kubeadm_version }}/bin/linux/amd64/kubeadm"
# Checksums
kubeadm_checksum: "ddd5949699d6bdbc0b90b379e7e534f137b1058db1acc8f26cc54843f017ffbf"
kubeadm_checksum: "8f6ceb26b8503bfc36a99574cf6f853be1c55405aa31669561608ad8099bf5bf"
# Containers
etcd_image_repo: "quay.io/coreos/etcd"
@@ -123,7 +123,7 @@ downloads:
container: true
repo: "{{ etcd_image_repo }}"
tag: "{{ etcd_image_tag }}"
sha256: "{{etcd_digest_checksum|default(None)}}"
sha256: "{{ etcd_digest_checksum|default(None) }}"
kubeadm:
version: "{{ kubeadm_version }}"
dest: "kubeadm"

roles/kubernetes-apps/ansible/tasks/main.yml (18)

@@ -8,7 +8,17 @@
delay: 6
when: inventory_hostname == groups['kube-master'][0]
- name: kubeadm | Delete kubeadm kubedns
- name: Kubernetes Apps | Delete old kubedns resources
kube:
name: "kubedns"
namespace: "{{ system_namespace }}"
kubectl: "{{bin_dir}}/kubectl"
resource: "{{ item }}"
state: absent
with_items: ['deploy', 'svc']
tags: upgrade
- name: Kubernetes Apps | Delete kubeadm kubedns
kube:
name: "kubedns"
namespace: "{{ system_namespace }}"
@@ -25,9 +35,9 @@
src: "{{item.file}}"
dest: "{{kube_config_dir}}/{{item.file}}"
with_items:
- {name: kubedns, file: kubedns-sa.yml, type: sa}
- {name: kubedns, file: kubedns-deploy.yml.j2, type: deployment}
- {name: kubedns, file: kubedns-svc.yml, type: svc}
- {name: kube-dns, file: kubedns-sa.yml, type: sa}
- {name: kube-dns, file: kubedns-deploy.yml.j2, type: deployment}
- {name: kube-dns, file: kubedns-svc.yml, type: svc}
- {name: kubedns-autoscaler, file: kubedns-autoscaler-sa.yml, type: sa}
- {name: kubedns-autoscaler, file: kubedns-autoscaler-clusterrole.yml, type: clusterrole}
- {name: kubedns-autoscaler, file: kubedns-autoscaler-clusterrolebinding.yml, type: clusterrolebinding}

roles/kubernetes-apps/network_plugin/weave/tasks/main.yml (11)

@@ -1,15 +1,4 @@
---
# FIXME: remove if kubernetes/features#124 is implemented
- name: Weave | Purge old weave daemonset
kube:
name: "weave-net"
kubectl: "{{ bin_dir }}/kubectl"
filename: "{{ kube_config_dir }}/weave-net.yml"
resource: "ds"
namespace: "{{system_namespace}}"
state: absent
when: inventory_hostname == groups['kube-master'][0] and weave_manifest.changed
- name: Weave | Start Resources
kube:
name: "weave-net"

roles/kubernetes-apps/rotate_tokens/tasks/main.yml (20)

@@ -0,0 +1,20 @@
---
#FIXME(mattymo): Exclude built in secrets that were automatically rotated,
#instead of filtering manually
- name: Rotate Tokens | Get all serviceaccount tokens to expire
shell: >-
{{ bin_dir }}/kubectl get secrets --all-namespaces
-o 'jsonpath={range .items[*]}{"\n"}{.metadata.namespace}{" "}{.metadata.name}{" "}{.type}{end}'
| grep kubernetes.io/service-account-token
| egrep 'default-token|kube-proxy|kube-dns|dnsmasq|netchecker|weave|calico|canal|flannel|dashboard|cluster-proportional-autoscaler|efk|tiller'
register: tokens_to_delete
run_once: true
- name: Rotate Tokens | Delete expired tokens
command: "{{ bin_dir }}/kubectl delete secrets -n {{ item.split(' ')[0] }} {{ item.split(' ')[1] }}"
with_items: "{{ tokens_to_delete.stdout_lines }}"
run_once: true
- name: Rotate Tokens | Delete pods in system namespace
command: "{{ bin_dir }}/kubectl delete pods -n {{ system_namespace }} --all"
run_once: true
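Run by hand, the query in the first task above emits one `namespace name type` triple per line, and the delete task then splits each line on spaces to build its `kubectl delete secrets` call. A minimal sketch of the equivalent manual flow (the secret name shown is illustrative):

    # list service-account token secrets belonging to the rotated components
    kubectl get secrets --all-namespaces \
      -o 'jsonpath={range .items[*]}{"\n"}{.metadata.namespace}{" "}{.metadata.name}{" "}{.type}{end}' \
      | grep kubernetes.io/service-account-token \
      | egrep 'default-token|kube-proxy|kube-dns|dnsmasq|netchecker|weave|calico|canal|flannel|dashboard|cluster-proportional-autoscaler|efk|tiller'
    # example output line (name is illustrative):
    #   kube-system kube-dns-token-x7k2p kubernetes.io/service-account-token
    # each line then becomes: kubectl delete secrets -n <namespace> <name>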

roles/kubernetes/kubeadm/tasks/main.yml (2)

@@ -24,7 +24,7 @@
register: kubeadm_client_conf
- name: Join to cluster if needed
command: kubeadm join --config {{ kube_config_dir}}/kubeadm-client.conf --skip-preflight-checks
command: "{{ bin_dir }}/kubeadm join --config {{ kube_config_dir}}/kubeadm-client.conf --skip-preflight-checks"
register: kubeadm_join
when: not is_kube_master and (kubeadm_client_conf.changed or not kubelet_conf.stat.exists)

roles/kubernetes/master/tasks/kubeadm-cleanup-old-certs.yml (3)

@@ -0,0 +1,3 @@
---
- name: kubeadm | Purge old certs
command: "rm -f {{kube_cert_dir }}/*.pem"

roles/kubernetes/master/tasks/kubeadm-migrate-certs.yml (12)

@@ -0,0 +1,12 @@
---
- name: Copy old certs to the kubeadm expected path
copy:
src: "{{ kube_cert_dir }}/{{ item.src }}"
dest: "{{ kube_cert_dir }}/{{ item.dest }}"
remote_src: yes
with_items:
- {src: apiserver.pem, dest: apiserver.crt}
- {src: apiserver-key.pem, dest: apiserver.key}
- {src: ca.pem, dest: ca.crt}
- {src: ca-key.pem, dest: ca.key}
register: kubeadm_copy_old_certs
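In plain terms, the task above duplicates the existing PEM material under the filenames kubeadm expects; a manual equivalent, assuming the default `kube_cert_dir` of `/etc/kubernetes/ssl`:

    cd /etc/kubernetes/ssl            # assumed kube_cert_dir
    cp apiserver.pem apiserver.crt
    cp apiserver-key.pem apiserver.key
    cp ca.pem ca.crt
    cp ca-key.pem ca.key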

roles/kubernetes/master/tasks/kubeadm-setup.yml (93)

@@ -1,4 +1,35 @@
---
- name: kubeadm | Check if old apiserver cert exists on host
stat:
path: "{{ kube_cert_dir }}/apiserver.pem"
register: old_apiserver_cert
delegate_to: "{{groups['kube-master']|first}}"
run_once: true
- name: kubeadm | Check service account key
stat:
path: "{{ kube_cert_dir }}/sa.key"
register: sa_key_before
delegate_to: "{{groups['kube-master']|first}}"
run_once: true
- name: kubeadm | Check if kubeadm has already run
stat:
path: "{{ kube_config_dir }}/admin.conf"
register: admin_conf
- name: kubeadm | Delete old static pods
file:
path: "{{ kube_config_dir }}/manifests/{{item}}.manifest"
state: absent
with_items: ["kube-apiserver", "kube-controller-manager", "kube-scheduler", "kube-proxy"]
when: old_apiserver_cert.stat.exists
- name: kubeadm | Forcefully delete old static pods
shell: "docker ps -f name=k8s_{{item}} -q | xargs --no-run-if-empty docker rm -f"
with_items: ["kube-apiserver", "kube-controller-manager", "kube-scheduler"]
when: old_apiserver_cert.stat.exists
- name: kubeadm | aggregate all SANs
set_fact:
apiserver_sans: >-
@@ -29,18 +60,29 @@
dest: "{{ kube_config_dir }}/kubeadm-config.yaml"
register: kubeadm_config
- name: Check if kubeadm has already run
stat:
path: "{{ kube_config_dir }}/admin.conf"
register: admin_conf
- name: kubeadm | Initialize first master
command: timeout -k 240s 240s kubeadm init --config={{ kube_config_dir }}/kubeadm-config.yaml --skip-preflight-checks
command: timeout -k 240s 240s {{ bin_dir }}/kubeadm init --config={{ kube_config_dir }}/kubeadm-config.yaml --skip-preflight-checks
register: kubeadm_init
#Retry is because upload config sometimes fails
retries: 3
when: inventory_hostname == groups['kube-master']|first and (kubeadm_config.changed or not admin_conf.stat.exists)
when: inventory_hostname == groups['kube-master']|first and not admin_conf.stat.exists
failed_when: kubeadm_init.rc != 0 and "field is immutable" not in kubeadm_init.stderr
notify: Master | restart kubelet
- name: kubeadm | Upgrade first master
command: timeout -k 240s 240s {{ bin_dir }}/kubeadm upgrade apply --config={{ kube_config_dir }}/kubeadm-config.yaml {{ kube_version }} --skip-preflight-checks
register: kubeadm_upgrade
#Retry is because upload config sometimes fails
retries: 3
when: inventory_hostname == groups['kube-master']|first and (kubeadm_config.changed and admin_conf.stat.exists)
failed_when: kubeadm_upgrade.rc != 0 and "field is immutable" not in kubeadm_upgrade.stderr
notify: Master | restart kubelet
# FIXME(mattymo): remove when https://github.com/kubernetes/kubeadm/issues/433 is fixed
- name: kubeadm | Enable kube-proxy
command: "{{ bin_dir }}/kubeadm alpha phase addon kube-proxy --config={{ kube_config_dir }}/kubeadm-config.yaml"
when: inventory_hostname == groups['kube-master']|first
changed_when: false
- name: slurp kubeadm certs
slurp:
@@ -62,7 +104,7 @@
delegate_to: "{{ groups['kube-master']|first }}"
run_once: true
- name: write out kubeadm certs
- name: kubeadm | write out kubeadm certs
copy:
dest: "{{ item.item }}"
content: "{{ item.content | b64decode }}"
@@ -74,9 +116,32 @@
with_items: "{{ kubeadm_certs.results }}"
when: inventory_hostname != groups['kube-master']|first
- name: kubeadm | Initialize other masters
command: timeout -k 240s 240s kubeadm init --config={{ kube_config_dir }}/kubeadm-config.yaml --skip-preflight-checks
- name: kubeadm | Init other uninitialized masters
command: timeout -k 240s 240s {{ bin_dir }}/kubeadm init --config={{ kube_config_dir }}/kubeadm-config.yaml --skip-preflight-checks
register: kubeadm_init
#Retry is because upload config sometimes fails
retries: 3
when: inventory_hostname != groups['kube-master']|first and (kubeadm_config.changed or not admin_conf.stat.exists or copy_kubeadm_certs.changed)
when: inventory_hostname != groups['kube-master']|first and not admin_conf.stat.exists
failed_when: kubeadm_init.rc != 0 and "field is immutable" not in kubeadm_init.stderr
notify: Master | restart kubelet
- name: kubeadm | Upgrade other masters
command: timeout -k 240s 240s {{ bin_dir }}/kubeadm upgrade apply --config={{ kube_config_dir }}/kubeadm-config.yaml {{ kube_version }} --skip-preflight-checks
register: kubeadm_upgrade
when: inventory_hostname != groups['kube-master']|first and (kubeadm_config.changed and admin_conf.stat.exists)
failed_when: kubeadm_upgrade.rc != 0 and "field is immutable" not in kubeadm_upgrade.stderr
notify: Master | restart kubelet
- name: kubeadm | Check service account key again
stat:
path: "{{ kube_cert_dir }}/sa.key"
register: sa_key_after
delegate_to: "{{groups['kube-master']|first}}"
run_once: true
- name: kubeadm | Set secret_changed if service account key was updated
command: /bin/true
notify: Master | set secret_changed
when: sa_key_before.stat.checksum|default("") != sa_key_after.stat.checksum
- name: kubeadm | cleanup old certs if necessary
include: kubeadm-cleanup-old-certs.yml
when: old_apiserver_cert.stat.exists
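Condensed, the flow on each master is: run `kubeadm init` only when `admin.conf` does not yet exist, run `kubeadm upgrade apply` when it does exist and the rendered config changed, and (on the first master) re-apply the kube-proxy addon as a workaround for kubernetes/kubeadm#433. A minimal shell sketch, assuming the default `/etc/kubernetes` config dir and a placeholder target version:

    CONF=/etc/kubernetes/kubeadm-config.yaml   # rendered from kubeadm-config.yaml.j2
    if [ ! -f /etc/kubernetes/admin.conf ]; then
      timeout -k 240s 240s kubeadm init --config="$CONF" --skip-preflight-checks
    else
      timeout -k 240s 240s kubeadm upgrade apply --config="$CONF" "$KUBE_VERSION" --skip-preflight-checks
    fi
    # workaround for kubernetes/kubeadm#433: explicitly (re)create the kube-proxy addon
    kubeadm alpha phase addon kube-proxy --config="$CONF"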

roles/kubernetes/master/templates/kubeadm-config.yaml.j2 (3)

@@ -17,7 +17,6 @@ networking:
podSubnet: {{ kube_pods_subnet }}
kubernetesVersion: {{ kube_version }}
cloudProvider: {{ cloud_provider|default('') }}
#TODO: cloud provider conf file
authorizationModes:
- Node
{% for mode in authorization_modes %}
@@ -53,7 +52,6 @@ apiServerExtraArgs:
runtime-config: {{ kube_api_runtime_config }}
{% endif %}
allow-privileged: "true"
#TODO: Custom flags compatible with kubeadm
controllerManagerExtraArgs:
node-monitor-grace-period: {{ kube_controller_node_monitor_grace_period }}
node-monitor-period: {{ kube_controller_node_monitor_period }}
@@ -61,7 +59,6 @@ controllerManagerExtraArgs:
{% if kube_feature_gates %}
feature-gates: {{ kube_feature_gates|join(',') }}
{% endif %}
#schedulerExtraArgs:
apiServerCertSANs:
{% for san in apiserver_sans.split(' ') | unique %}
- {{ san }}

roles/kubernetes/node/tasks/install.yml (10)

@@ -19,12 +19,20 @@
when: kubeadm_enabled
tags: kubeadm
- name: install | Copy binary from download dir
- name: install | Copy kubeadm binary from download dir
command: rsync -piu "{{ local_release_dir }}/kubeadm" "{{ bin_dir }}/kubeadm"
changed_when: false
when: kubeadm_enabled
tags: kubeadm
- name: install | Set kubeadm binary permissions
file:
path: "{{ bin_dir }}/kubeadm"
mode: "0755"
state: file
when: kubeadm_enabled
tags: kubeadm
- include: "install_{{ kubelet_deployment_type }}.yml"
- name: install | Write kubelet systemd init file

roles/kubernetes/node/tasks/main.yml (7)

@@ -96,6 +96,13 @@
when: not kubeadm_enabled
tags: kube-proxy
- name: Purge proxy manifest for kubeadm
file:
path: "{{ kube_manifest_dir }}/kube-proxy.manifest"
state: absent
when: kubeadm_enabled
tags: kube-proxy
# reload-systemd
- meta: flush_handlers

roles/reset/tasks/main.yml (1)

@@ -75,6 +75,7 @@
with_items:
- "{{kube_config_dir}}"
- /var/lib/kubelet
- /root/.kube
- "{{ etcd_data_dir }}"
- /etc/ssl/etcd
- /var/log/calico

tests/testcases/030_check-network.yml (14)

@@ -13,14 +13,22 @@
when: not ansible_os_family in ["CoreOS", "Container Linux by CoreOS"]
- name: Get pod names
shell: "{{bin_dir}}/kubectl get pods -o json"
- name: Wait for pods to be ready
shell: "{{bin_dir}}/kubectl get pods"
register: pods
until: '"ContainerCreating" not in pods.stdout and "Terminating" not in pods.stdout'
until:
- '"ContainerCreating" not in pods.stdout'
- '"Pending" not in pods.stdout'
- '"Terminating" not in pods.stdout'
retries: 60
delay: 2
no_log: true
- name: Get pod names
shell: "{{bin_dir}}/kubectl get pods -o json"
register: pods
no_log: true
- name: Get hostnet pods
command: "{{bin_dir}}/kubectl get pods -o
jsonpath='{range .items[?(.spec.hostNetwork)]}{.metadata.name} {.status.podIP} {.status.containerStatuses} {end}'"

upgrade-cluster.yml (2)

@@ -67,7 +67,6 @@
- { role: kubernetes/node, tags: node }
- { role: kubernetes/master, tags: master }
- { role: network_plugin, tags: network }
- { role: kubernetes/kubeadm, tags: kubeadm, when: "kubeadm_enabled" }
- { role: upgrade/post-upgrade, tags: post-upgrade }
#Finally handle worker upgrades, based on given batch size
@@ -87,6 +86,7 @@
any_errors_fatal: true
roles:
- { role: kubespray-defaults}
- { role: kubernetes-apps/rotate_tokens, tags: rotate_tokens, when: "secret_changed|default(false)" }
- { role: kubernetes-apps/network_plugin, tags: network }
- { role: kubernetes-apps/policy_controller, tags: policy-controller }
- { role: kubernetes/client, tags: client }
