Browse Source
Fix recover-control-plane to work with etcd 3.3.x and add CI (#5500)
Fix recover-control-plane to work with etcd 3.3.x and add CI (#5500)
* Fix recover-control-plane to work with etcd 3.3.x and add CI * Set default values for testcase * Add actual test jobs * Attempt to satisfy gitlab ci linter * Fix ansible targets * Set etcd_member_name as stated in the docs... * Recovering from 0 masters is not supported yet * Add other master to broken_kube-master group as well * Increase number of retries to see if etcd needs more time to heal * Make number of retries for ETCD loops configurable, increase it for recovery CI and document it pull/5626/head
committed by
GitHub
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
23 changed files with 204 additions and 134 deletions
Split View
Diff Options
-
2.gitlab-ci.yml
-
16.gitlab-ci/packet.yml
-
30docs/recover-control-plane.md
-
1recover-control-plane.yml
-
3roles/etcd/defaults/main.yml
-
4roles/etcd/tasks/configure.yml
-
2roles/etcd/tasks/install_docker.yml
-
2roles/etcd/tasks/join_etcd-events_member.yml
-
2roles/etcd/tasks/join_etcd_member.yml
-
77roles/recover_control_plane/etcd/tasks/main.yml
-
48roles/recover_control_plane/etcd/tasks/prepare.yml
-
10roles/recover_control_plane/etcd/tasks/recover_lost_quorum.yml
-
17roles/recover_control_plane/master/tasks/main.yml
-
2roles/recover_control_plane/pre-recover/defaults/main.yml
-
36roles/recover_control_plane/pre-recover/tasks/main.yml
-
2tests/cloud_playbooks/roles/packet-ci/tasks/main.yml
-
39tests/cloud_playbooks/roles/packet-ci/templates/inventory.j2
-
10tests/files/packet_ubuntu18-calico-ha-recover-noquorum.yml
-
10tests/files/packet_ubuntu18-calico-ha-recover.yml
-
6tests/scripts/testcases_run.sh
-
6tests/templates/inventory-aws.j2
-
6tests/templates/inventory-do.j2
-
7tests/templates/inventory-gce.j2
@ -1,7 +1,78 @@ |
|||
--- |
|||
# prepare.yml is deleted by this same change (hunk "-1,48 +0,0"); its
# certificate cleanup and member removal tasks are defined directly in this
# file, so the stale include is dropped to avoid a missing-file failure.
|||
# Probe the etcd endpoints with the v3 API using the admin client certificate
# of the current host. The result is registered and errors are ignored so the
# following tasks can inspect stdout/stderr. Only runs when the broken_etcd
# group is non-empty.
- name: Get etcd endpoint health
  environment:
    - ETCDCTL_API: 3
  shell: >-
    {{ bin_dir }}/etcdctl
    --cacert {{ etcd_cert_dir }}/ca.pem
    --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem
    --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem
    --endpoints={{ etcd_access_addresses }}
    endpoint health
  register: etcd_endpoint_health
  changed_when: false
  ignore_errors: true
  check_mode: false
  when:
    - groups['broken_etcd']
|||
|
|||
# Derive the boolean "healthy" fact from the stderr of the health probe.
# The cluster counts as healthy only when etcdctl did NOT report
# "Error: unhealthy cluster"; the previous expression had the polarity
# inverted, while the later tasks act on `not healthy`.
# `search` is used rather than `match` because stderr is also read for the
# per-endpoint "is unhealthy" lines, so the error marker is not guaranteed to
# be at the very start of the text. The test syntax (`is not ...`) replaces
# the filter form (`| match`), which newer Ansible releases no longer accept.
- name: Set healthy fact
  set_fact:
    healthy: "{{ etcd_endpoint_health.stderr is not search('Error: unhealthy cluster') }}"
  when:
    - groups['broken_etcd']
|||
|
|||
# Compare the number of endpoints reported healthy (stdout) with the number
# reported unhealthy (stderr) by the health probe.
# Guarded by the same condition as the probe itself: when the probe was
# skipped, its registered result has no stdout_lines/stderr_lines and
# templating this expression would fail.
# NOTE(review): a tie (healthy == unhealthy) is treated as "has quorum" here;
# a strict majority would need ">" -- confirm the intended semantics.
- name: Set has_quorum fact
  set_fact:
    has_quorum: "{{ (etcd_endpoint_health.stdout_lines | select('match', '.*is healthy.*') | list | length) >= (etcd_endpoint_health.stderr_lines | select('match', '.*is unhealthy.*') | list | length) }}"
  when:
    - groups['broken_etcd']
|||
|
|||
# Run the lost-quorum recovery only when there are broken etcd members and the
# health probe shows that quorum is gone.
# The former `has_etcdctl` / `etcd_cluster_is_healthy` conditions are dropped:
# those facts were set only by the pre-recover tasks that this change deletes,
# so referencing them would fail on an undefined variable.
- include_tasks: recover_lost_quorum.yml
  when:
    - groups['broken_etcd']
    - not has_quorum
|||
|
|||
# Delete the etcd data directory on every host of the broken_etcd group
# (the task is delegated to each listed host in turn). Only done while the
# cluster still has quorum.
- name: Remove etcd data dir
  file:
    state: absent
    path: "{{ etcd_data_dir }}"
  with_items: "{{ groups['broken_etcd'] }}"
  delegate_to: "{{ item }}"
  when:
    - groups['broken_etcd']
    - has_quorum
|||
|
|||
# Remove every file in the etcd certificate directory whose name contains the
# inventory name of a broken etcd host. Failures are tolerated here and
# evaluated by the follow-up task through the registered result.
- name: Delete old certificates
  # noqa 302 - rm is ok here for now
  shell: "rm {{ etcd_cert_dir }}/*{{ item }}*"
  when: groups['broken_etcd']
  with_items: "{{ groups['broken_etcd'] }}"
  ignore_errors: true
  register: delete_old_cerificates
|||
|
|||
# Walk the per-host results of the certificate removal and abort for any host
# whose `rm` failed for a reason other than the files already being absent.
- name: Fail if unable to delete old certificates
  loop: "{{ delete_old_cerificates.results }}"
  fail:
    msg: "Unable to delete old certificates for: {{ item.item }}"
  when:
    - groups['broken_etcd']
    - "(item.rc != 0) and ('No such file or directory' not in item.stderr)"
  changed_when: false
|||
|
|||
# List the current etcd members through the v3 API and keep the output for
# the removal task. Runs only when broken members exist, the cluster is
# reported unhealthy and quorum is still present.
- name: Get etcd cluster members
  environment:
    - ETCDCTL_API: 3
  shell: >-
    {{ bin_dir }}/etcdctl
    --cacert {{ etcd_cert_dir }}/ca.pem
    --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem
    --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem
    member list
  register: member_list
  check_mode: false
  changed_when: false
  when:
    - groups['broken_etcd']
    - not healthy
    - has_quorum
|||
|
|||
# Pair every broken etcd host with every line of the member list. Each line is
# stripped of spaces and split on commas: when the third field equals the
# host's etcd_member_name, the first field is passed to `member remove`.
- name: Remove broken cluster members
  environment:
    - ETCDCTL_API: 3
  shell: >-
    {{ bin_dir }}/etcdctl
    --cacert {{ etcd_cert_dir }}/ca.pem
    --cert {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem
    --key {{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem
    --endpoints={{ etcd_access_addresses }}
    member remove {{ item[1].replace(' ','').split(',')[0] }}
  with_nested:
    - "{{ groups['broken_etcd'] }}"
    - "{{ member_list.stdout_lines }}"
  when:
    - groups['broken_etcd']
    - not healthy
    - has_quorum
    - hostvars[item[0]]['etcd_member_name'] == item[1].replace(' ','').split(',')[2]
@ -1,48 +0,0 @@ |
|||
--- |
|||
# For each name in the comma-separated `old_etcds` variable, remove matching
# files from both certificate directories. Errors are ignored at this point
# and checked afterwards via the registered result.
- name: Delete old certificates
  # noqa 302 - rm is ok here for now
  shell: "rm /etc/ssl/etcd/ssl/*{{ item }}* /etc/kubernetes/ssl/etcd/*{{ item }}*"
  when: old_etcds is defined
  with_items: "{{ old_etcds.split(',') }}"
  ignore_errors: true
  register: delete_old_cerificates
|||
|
|||
# Abort when removing the certificates of an old etcd member failed for any
# reason other than the files not existing.
- name: Fail if unable to delete old certificates
  loop: "{{ delete_old_cerificates.results }}"
  fail:
    msg: "Unable to delete old certificates for: {{ item.item }}"
  when:
    - old_etcds is defined
    - "(item.rc != 0) and ('No such file or directory' not in item.stderr)"
  changed_when: false
|||
|
|||
# List the etcd members through the v3 API and register the output.
# With ETCDCTL_API=3 the CA bundle is configured via ETCDCTL_CACERT (the
# environment form of --cacert); ETCDCTL_CA_FILE is the v2 name and does not
# configure the CA for the v3 client. Environment values are quoted so they
# are passed as strings.
- name: Get etcd cluster members
  shell: "{{ bin_dir }}/etcdctl member list"
  register: member_list
  changed_when: false
  check_mode: false
  environment:
    - ETCDCTL_API: "3"
    - ETCDCTL_CACERT: /etc/ssl/etcd/ssl/ca.pem
    - ETCDCTL_CERT: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}.pem"
    - ETCDCTL_KEY: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}-key.pem"
  when:
    - has_etcdctl
    - etcd_cluster_is_healthy
    - old_etcd_members is defined
|||
|
|||
# Pair every name in the comma-separated `old_etcd_members` variable with
# every line of the member list. Each line is stripped of spaces and split on
# commas: when the third field equals the old member name, the first field is
# passed to `member remove`.
# With ETCDCTL_API=3 the CA bundle is configured via ETCDCTL_CACERT (the
# environment form of --cacert); ETCDCTL_CA_FILE is the v2 name.
- name: Remove old cluster members
  shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} member remove {{ item[1].replace(' ','').split(',')[0] }}"
  environment:
    - ETCDCTL_API: "3"
    - ETCDCTL_CACERT: /etc/ssl/etcd/ssl/ca.pem
    - ETCDCTL_CERT: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}.pem"
    - ETCDCTL_KEY: "/etc/ssl/etcd/ssl/admin-{{ inventory_hostname }}-key.pem"
  with_nested:
    - "{{ old_etcd_members.split(',') }}"
    - "{{ member_list.stdout_lines }}"
  when:
    - has_etcdctl
    - etcd_cluster_is_healthy
    - old_etcd_members is defined
    - item[0] == item[1].replace(' ','').split(',')[2]
@ -1,2 +0,0 @@ |
|||
--- |
|||
# True when the etcd group and the kube-master group contain the same hosts.
# Both sides are parenthesised explicitly: Jinja filters bind tighter than
# `==`, so the former trailing `| bool` converted the sorted kube-master list
# (not the comparison result) and the expression could never be true.
control_plane_is_converged: "{{ (groups['etcd'] | sort) == (groups['kube-master'] | sort) }}"
@ -1,36 +0,0 @@ |
|||
--- |
|||
# Test whether the etcdctl binary exists and register the result.
# A missing binary makes `test -e` return non-zero; that is an expected
# outcome consumed by the next task, so it must neither fail the play nor be
# reported as a change.
- name: Check for etcdctl binary
  raw: "test -e {{ bin_dir }}/etcdctl"
  register: test_etcdctl
  failed_when: false
  changed_when: false
|||
|
|||
# has_etcdctl is true when the existence test exited with 0.
# The comparison is parenthesised so that `bool` converts the comparison
# result; previously the filter was applied to the literal 0.
- name: Set has_etcdctl fact
  set_fact:
    has_etcdctl: "{{ (test_etcdctl.rc == 0) | bool }}"
|||
|
|||
# Run `etcdctl cluster-health` against the configured endpoints and let grep
# decide the exit status: 0 only when the output contains
# 'cluster is healthy'. Errors are ignored; the registered result is read by
# the next task. Skipped when etcdctl is not installed.
- name: Check if etcd cluster is healthy
  environment:
    ETCDCTL_CA_FILE: "{{ etcd_cert_dir }}/ca.pem"
    ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
  shell: >-
    {{ bin_dir }}/etcdctl
    --endpoints={{ etcd_access_addresses }}
    cluster-health | grep -q 'cluster is healthy'
  register: etcd_cluster_health
  when: has_etcdctl
  check_mode: false
  changed_when: false
  ignore_errors: true
|||
|
|||
# etcd_cluster_is_healthy is true when the health check exited with 0.
# The health check is skipped when etcdctl is missing, in which case the
# registered result carries no `rc`; default to a non-zero value so the fact
# becomes false instead of failing on an undefined attribute.
- name: Set etcd_cluster_is_healthy fact
  set_fact:
    etcd_cluster_is_healthy: "{{ (etcd_cluster_health.rc | default(1)) == 0 }}"
|||
|
|||
# When the cluster is healthy, stop unless `old_etcd_members` was supplied.
# `that` takes raw conditional expressions, so the condition is written
# without Jinja delimiters.
- name: Abort if etcd cluster is healthy and old_etcd_members is undefined
  assert:
    that:
      - old_etcd_members is defined
    msg: "'old_etcd_members' must be defined when the etcd cluster has quorum."
  when: etcd_cluster_is_healthy
|||
|
|||
# Print a warning when the etcd and kube-master groups differ.
# The former `delay: 30` keyword is removed: `delay` only applies between
# retries of an `until` loop, so it never paused this task.
# NOTE(review): if an actual 30 second pause was intended so the operator can
# abort, that needs the `pause` module -- confirm the desired behaviour.
- name: Warn for untested recovery
  debug:
    msg: "Control plane recovery of split control planes is UNTESTED! Abort or continue at your own risk."
  when: not control_plane_is_converged
@ -0,0 +1,10 @@ |
|||
---
# Instance settings for the CI test case
mode: ha-recover-noquorum
cloud_image: ubuntu-1804
vm_memory: 1600Mi

# Kubespray settings for the CI test case
deploy_netchecker: true
kube_network_plugin: calico
dns_min_replicas: 1
@ -0,0 +1,10 @@ |
|||
---
# Instance settings for the CI test case
mode: ha-recover
cloud_image: ubuntu-1804
vm_memory: 1600Mi

# Kubespray settings for the CI test case
deploy_netchecker: true
kube_network_plugin: calico
dns_min_replicas: 1
Write
Preview
Loading…
Cancel
Save