From 0f0e24be0fa77615896168b251cf470d8d9a952b Mon Sep 17 00:00:00 2001 From: Max Gautier Date: Tue, 5 Nov 2024 07:11:29 +0100 Subject: [PATCH] etcd: throttle restart for availability (#11677) * etcd: throttle restart for availability During upgrade, etcd members are restarted all at once. This can impact the availability of the etcd cluster and subsequently of the Kubernetes cluster. Limit concurrent restarts so that the etcd cluster can keep quorum. * Simplify etcd handlers --- roles/etcd/handlers/main.yml | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/roles/etcd/handlers/main.yml b/roles/etcd/handlers/main.yml index 62c56dee8..b1123f530 100644 --- a/roles/etcd/handlers/main.yml +++ b/roles/etcd/handlers/main.yml @@ -2,26 +2,25 @@ - name: Backup etcd import_tasks: backup.yml -- name: Etcd | reload systemd +- name: Restart etcd systemd_service: - daemon_reload: true - listen: - - Restart etcd - - Restart etcd-events - -- name: Reload etcd - service: name: etcd state: restarted + daemon_reload: true when: ('etcd' in group_names) - listen: Restart etcd + throttle: "{{ groups['etcd'] | length // 2 }}" + # Etcd cluster MUST have an odd number of members + # Truncated integer division by 2 will always return (majority - 1) which + # means the cluster will keep quorum and stay available -- name: Reload etcd-events - service: +- name: Restart etcd-events + systemd_service: name: etcd-events state: restarted + daemon_reload: true + # TODO: this seems odd. etcd-events should be a different group possibly ? when: ('etcd' in group_names) - listen: Restart etcd-events + throttle: "{{ groups['etcd'] | length // 2 }}" - name: Wait for etcd up uri: