Browse Source

Etcd cluster setup makeover

The current way to setup the etc cluster is messy and buggy.

- It checks for cluster is healthy before the cluster is even created.
- The unit files are started on handlers, not in the task, so you mess with "flush handlers".
- The join_member.yml is not used.
- etcd events cluster is not configured for kubeadm
- remove duplicate runs between running the role on etcd nodes and k8s nodes
pull/2577/head
woopstar 6 years ago
committed by Andreas Kruger
parent
commit
86e3506ae6
15 changed files with 129 additions and 166 deletions
  1. 4
      cluster.yml
  2. 1
      roles/etcd/defaults/main.yml
  3. 5
      roles/etcd/handlers/main.yml
  4. 115
      roles/etcd/tasks/configure.yml
  5. 17
      roles/etcd/tasks/gen_certs_script.yml
  6. 8
      roles/etcd/tasks/install_docker.yml
  7. 1
      roles/etcd/tasks/install_host.yml
  8. 2
      roles/etcd/tasks/install_rkt.yml
  9. 13
      roles/etcd/tasks/join_etcd-events_member.yml
  10. 15
      roles/etcd/tasks/join_etcd_member.yml
  11. 47
      roles/etcd/tasks/join_member.yml
  12. 36
      roles/etcd/tasks/main.yml
  13. 2
      roles/etcd/tasks/refresh_config.yml
  14. 26
      roles/etcd/tasks/set_cluster_health.yml
  15. 3
      roles/kubernetes/master/templates/kubeadm-config.yaml.j2

4
cluster.yml

@ -51,13 +51,13 @@
any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
roles:
- { role: kubespray-defaults}
- { role: etcd, tags: etcd, etcd_cluster_setup: true }
- { role: etcd, tags: etcd }
- hosts: k8s-cluster:calico-rr
any_errors_fatal: "{{ any_errors_fatal | default(true) }}"
roles:
- { role: kubespray-defaults}
- { role: etcd, tags: etcd, etcd_cluster_setup: false }
- { role: etcd, tags: etcd }
- hosts: etcd:k8s-cluster:vault:calico-rr
any_errors_fatal: "{{ any_errors_fatal | default(true) }}"

1
roles/etcd/defaults/main.yml

@ -1,6 +1,7 @@
---
# Set to false to only do certificate management
etcd_cluster_setup: true
etcd_events_cluster_setup: false
etcd_backup_prefix: "/var/backups"
etcd_data_dir: "/var/lib/etcd"

5
roles/etcd/handlers/main.yml

@ -10,7 +10,7 @@
- name: restart etcd-events
command: /bin/true
notify:
- etcd-events | reload systemd
- etcd | reload systemd
- reload etcd-events
- wait for etcd-events up
@ -19,9 +19,6 @@
- name: etcd | reload systemd
command: systemctl daemon-reload
- name: etcd-events | reload systemd
command: systemctl daemon-reload
- name: reload etcd
service:
name: etcd

115
roles/etcd/tasks/configure.yml

@ -1,20 +1,20 @@
---
- name: Configure | Check if member is in etcd cluster
shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_access_addresses }} member list | grep -q {{ etcd_access_address }}"
register: etcd_member_in_cluster
- name: Configure | Check if etcd cluster is healthy
shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} cluster-health | grep -q 'cluster is healthy'"
register: etcd_cluster_is_healthy
ignore_errors: true
changed_when: false
check_mode: no
when: is_etcd_master
when: is_etcd_master and etcd_cluster_setup
tags:
- facts
environment:
ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
- name: Configure | Check if member is in etcd-events cluster
shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_events_access_addresses }} member list | grep -q {{ etcd_access_address }}"
register: etcd_events_member_in_cluster
- name: Configure | Check if etcd-events cluster is healthy
shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_events_access_addresses }} cluster-health | grep -q 'cluster is healthy'"
register: etcd_events_cluster_is_healthy
ignore_errors: true
changed_when: false
check_mode: no
@ -25,44 +25,109 @@
ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
- include_tasks: refresh_config.yml
when: is_etcd_master
- name: Configure | Copy etcd.service systemd file
template:
src: "etcd-{{ etcd_deployment_type }}.service.j2"
dest: /etc/systemd/system/etcd.service
backup: yes
when: is_etcd_master
notify: restart etcd
when: is_etcd_master and etcd_cluster_setup
- name: Configure | Copy etcd-events.service systemd file
template:
src: "etcd-events-host.service.j2"
src: "etcd-events-{{ etcd_deployment_type }}.service.j2"
dest: /etc/systemd/system/etcd-events.service
backup: yes
when: is_etcd_master and etcd_deployment_type == "host" and etcd_events_cluster_setup
notify: restart etcd-events
when: is_etcd_master and etcd_events_cluster_setup
- name: Configure | Copy etcd-events.service systemd file
template:
src: "etcd-events-docker.service.j2"
dest: /etc/systemd/system/etcd-events.service
backup: yes
when: is_etcd_master and etcd_deployment_type == "docker" and etcd_events_cluster_setup
notify: restart etcd-events
- name: Configure | reload systemd
command: systemctl daemon-reload
when: is_etcd_master
- name: Configure | Ensure etcd is running
service:
name: etcd
state: started
enabled: yes
when: is_etcd_master and etcd_cluster_setup
- name: Configure | Ensure etcd-events is running
service:
name: etcd-events
state: started
enabled: yes
when: is_etcd_master and etcd_events_cluster_setup
- name: Configure | Check if etcd cluster is healthy
shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} cluster-health | grep -q 'cluster is healthy'"
register: etcd_cluster_is_healthy
until: etcd_cluster_is_healthy.rc == 0
retries: 4
delay: "{{ retry_stagger | random + 3 }}"
ignore_errors: false
changed_when: false
check_mode: no
when: is_etcd_master and etcd_cluster_setup
tags:
- facts
environment:
ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
- name: Configure | Check if etcd-events cluster is healthy
shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_events_access_addresses }} cluster-health | grep -q 'cluster is healthy'"
register: etcd_events_cluster_is_healthy
until: etcd_events_cluster_is_healthy.rc == 0
retries: 4
delay: "{{ retry_stagger | random + 3 }}"
ignore_errors: false
changed_when: false
check_mode: no
when: is_etcd_master and etcd_events_cluster_setup
tags:
- facts
environment:
ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
- name: Configure | Check if member is in etcd cluster
shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_access_addresses }} member list | grep -q {{ etcd_access_address }}"
register: etcd_member_in_cluster
ignore_errors: true
changed_when: false
check_mode: no
when: is_etcd_master and etcd_cluster_setup
tags:
- facts
environment:
ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
- name: Configure | Check if member is in etcd-events cluster
shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_events_access_addresses }} member list | grep -q {{ etcd_access_address }}"
register: etcd_events_member_in_cluster
ignore_errors: true
changed_when: false
check_mode: no
when: is_etcd_master and etcd_events_cluster_setup
tags:
- facts
environment:
ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
- name: Configure | Join member(s) to etcd cluster one at a time
include_tasks: join_etcd_member.yml
vars:
target_node: "{{ item }}"
loop_control:
pause: 10
with_items: "{{ groups['etcd'] }}"
when: inventory_hostname == item and etcd_member_in_cluster.rc != 0 and etcd_cluster_is_healthy.rc == 0
when: inventory_hostname == item and etcd_cluster_setup and etcd_member_in_cluster.rc != 0 and etcd_cluster_is_healthy.rc == 0
- name: Configure | Join member(s) to etcd-events cluster one at a time
include_tasks: join_etcd-evetns_member.yml
include_tasks: join_etcd-events_member.yml
vars:
target_node: "{{ item }}"
loop_control:
pause: 10
with_items: "{{ groups['etcd'] }}"
when: inventory_hostname == item and etcd_events_cluster_setup and etcd_events_member_in_cluster.rc != 0 and etcd_events_cluster_is_healthy.rc == 0

17
roles/etcd/tasks/gen_certs_script.yml

@ -15,6 +15,7 @@
owner: root
mode: 0700
run_once: yes
when: inventory_hostname == groups['etcd'][0]
delegate_to: "{{groups['etcd'][0]}}"
- name: "Gen_certs | create etcd cert dir (on {{groups['etcd'][0]}})"
@ -26,6 +27,7 @@
recurse: yes
mode: 0700
run_once: yes
when: inventory_hostname == groups['etcd'][0]
delegate_to: "{{groups['etcd'][0]}}"
- name: Gen_certs | write openssl config
@ -34,7 +36,9 @@
dest: "{{ etcd_config_dir }}/openssl.conf"
run_once: yes
delegate_to: "{{groups['etcd'][0]}}"
when: gen_certs|default(false)
when:
- gen_certs|default(false)
- inventory_hostname == groups['etcd'][0]
- name: Gen_certs | copy certs generation script
copy:
@ -43,8 +47,9 @@
mode: 0700
run_once: yes
delegate_to: "{{groups['etcd'][0]}}"
when: gen_certs|default(false)
when:
- gen_certs|default(false)
- inventory_hostname == groups['etcd'][0]
- name: Gen_certs | run cert generation script
command: "bash -x {{ etcd_script_dir }}/make-ssl-etcd.sh -f {{ etcd_config_dir }}/openssl.conf -d {{ etcd_cert_dir }}"
@ -61,7 +66,9 @@
{% endfor %}"
run_once: yes
delegate_to: "{{groups['etcd'][0]}}"
when: gen_certs|default(false)
when:
- gen_certs|default(false)
- inventory_hostname == groups['etcd'][0]
notify: set etcd_secret_changed
- set_fact:
@ -160,5 +167,5 @@
group: "{{ etcd_cert_group }}"
state: directory
owner: kube
mode: "u=rwX,g-rwx,o-rwx"
mode: "640"
recurse: yes

8
roles/etcd/tasks/install_docker.yml

@ -9,22 +9,22 @@
retries: 4
delay: "{{ retry_stagger | random + 3 }}"
changed_when: false
when: etcd_cluster_setup
- name: Install etcd launch script
template:
src: etcd.j2
dest: "{{ bin_dir }}/etcd"
owner: 'root'
mode: 0755
mode: 0750
backup: yes
notify: restart etcd
when: etcd_cluster_setup
- name: Install etcd-events launch script
template:
src: etcd-events.j2
dest: "{{ bin_dir }}/etcd-events"
owner: 'root'
mode: 0755
mode: 0750
backup: yes
when: etcd_events_cluster_setup
notify: restart etcd-events

1
roles/etcd/tasks/install_host.yml

@ -10,3 +10,4 @@
retries: 4
delay: "{{ retry_stagger | random + 3 }}"
changed_when: false
when: etcd_cluster_setup

2
roles/etcd/tasks/install_rkt.yml

@ -11,6 +11,7 @@
delay: "{{ retry_stagger | random + 3 }}"
changed_when: false
environment: "{{proxy_env}}"
when: etcd_cluster_setup
- name: Install | Copy etcdctl binary from rkt container
command: >-
@ -26,3 +27,4 @@
delay: "{{ retry_stagger | random + 3 }}"
changed_when: false
environment: "{{proxy_env}}"
when: etcd_cluster_setup

13
roles/etcd/tasks/join_etcd-events_member.yml

@ -1,5 +1,5 @@
---
- name: Join Member | Add member to cluster
- name: Join Member | Add member to etcd-events cluster
shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_events_access_addresses }} member add {{ etcd_member_name }} {{ etcd_events_peer_url }}"
register: member_add_result
until: member_add_result.rc == 0
@ -23,17 +23,6 @@
{%- endfor -%}
when: target_node == inventory_hostname
- name: Join Member | reload systemd
command: systemctl daemon-reload
when: target_node == inventory_hostname
- name: Join Member | Ensure etcd-events is running
service:
name: etcd-events
state: started
enabled: yes
when: target_node == inventory_hostname
- name: Join Member | Ensure member is in etcd-events cluster
shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_events_access_addresses }} member list | grep -q {{ etcd_events_access_address }}"
register: etcd_events_member_in_cluster

15
roles/etcd/tasks/join_etcd_member.yml

@ -1,5 +1,5 @@
---
- name: Join Member | Add member to cluster
- name: Join Member | Add member to etcd cluster
shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} member add {{ etcd_member_name }} {{ etcd_peer_url }}"
register: member_add_result
until: member_add_result.rc == 0
@ -23,18 +23,7 @@
{%- endfor -%}
when: target_node == inventory_hostname
- name: Join Member | reload systemd
command: systemctl daemon-reload
when: target_node == inventory_hostname
- name: Join Member | Ensure etcd is running
service:
name: etcd
state: started
enabled: yes
when: target_node == inventory_hostname
- name: Join Member | Ensure member is in cluster
- name: Join Member | Ensure member is in etcd cluster
shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_access_addresses }} member list | grep -q {{ etcd_access_address }}"
register: etcd_member_in_cluster
changed_when: false

47
roles/etcd/tasks/join_member.yml

@ -1,47 +0,0 @@
---
- name: Join Member | Add member to cluster
shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} member add {{ etcd_member_name }} {{ etcd_peer_url }}"
register: member_add_result
until: member_add_result.rc == 0
retries: 4
delay: "{{ retry_stagger | random + 3 }}"
when: target_node == inventory_hostname
environment:
ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
- include_tasks: refresh_config.yml
vars:
etcd_peer_addresses: >-
{% for host in groups['etcd'] -%}
{%- if hostvars[host]['etcd_member_in_cluster'].rc == 0 -%}
{{ "etcd"+loop.index|string }}=https://{{ hostvars[host].access_ip | default(hostvars[host].ip | default(hostvars[host].ansible_default_ipv4['address'])) }}:2380,
{%- endif -%}
{%- if loop.last -%}
{{ etcd_member_name }}={{ etcd_peer_url }}
{%- endif -%}
{%- endfor -%}
when: target_node == inventory_hostname
- name: Join Member | reload systemd
command: systemctl daemon-reload
when: target_node == inventory_hostname
- name: Join Member | Ensure etcd is running
service:
name: etcd
state: started
enabled: yes
when: target_node == inventory_hostname
- name: Join Member | Ensure member is in cluster
shell: "{{ bin_dir }}/etcdctl --no-sync --endpoints={{ etcd_access_addresses }} member list | grep -q {{ etcd_access_address }}"
register: etcd_member_in_cluster
changed_when: false
check_mode: no
tags:
- facts
when: target_node == inventory_hostname
environment:
ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"

36
roles/etcd/tasks/main.yml

@ -6,6 +6,7 @@
- facts
- include_tasks: "gen_certs_{{ cert_management }}.yml"
when:
tags:
- etcd-secrets
@ -29,47 +30,28 @@
tags:
- upgrade
- include_tasks: set_cluster_health.yml
when: is_etcd_master and etcd_cluster_setup
- include_tasks: configure.yml
when: is_etcd_master and etcd_cluster_setup
when: is_etcd_master
- include_tasks: refresh_config.yml
when: is_etcd_master and etcd_cluster_setup
when: is_etcd_master
- name: Restart etcd if certs changed
command: /bin/true
notify: restart etcd
when: is_etcd_master and etcd_secret_changed|default(false)
- name: Restart etcd-events if certs changed
command: /bin/true
notify: restart etcd
when: is_etcd_master and etcd_events_cluster_setup and etcd_secret_changed|default(false)
# reload-systemd
- meta: flush_handlers
- name: Ensure etcd is running
service:
name: etcd
state: started
state: restarted
enabled: yes
when: is_etcd_master and etcd_cluster_setup
when: is_etcd_master and etcd_cluster_setup and etcd_secret_changed|default(false)
- name: Ensure etcd-events is running
- name: Restart etcd-events if certs changed
service:
name: etcd-events
state: started
state: restarted
enabled: yes
when: is_etcd_master and etcd_events_cluster_setup
when: is_etcd_master and etcd_events_cluster_setup and etcd_secret_changed|default(false)
# After etcd cluster is assembled, make sure that
# initial state of the cluster is in `existing`
# state insted of `new`.
- include_tasks: set_cluster_health.yml
when: is_etcd_master and etcd_cluster_setup
- include_tasks: refresh_config.yml
when: is_etcd_master and etcd_cluster_setup
when: is_etcd_master

2
roles/etcd/tasks/refresh_config.yml

@ -4,7 +4,7 @@
src: etcd.env.j2
dest: /etc/etcd.env
notify: restart etcd
when: is_etcd_master
when: is_etcd_master and etcd_cluster_setup
- name: Refresh config | Create etcd-events config file
template:

26
roles/etcd/tasks/set_cluster_health.yml

@ -1,26 +0,0 @@
---
- name: Configure | Check if etcd cluster is healthy
shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_access_addresses }} cluster-health | grep -q 'cluster is healthy'"
register: etcd_cluster_is_healthy
ignore_errors: true
changed_when: false
check_mode: no
when: is_etcd_master
tags:
- facts
environment:
ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
- name: Configure | Check if etcd-events cluster is healthy
shell: "{{ bin_dir }}/etcdctl --endpoints={{ etcd_events_access_addresses }} cluster-health | grep -q 'cluster is healthy'"
register: etcd_events_cluster_is_healthy
ignore_errors: true
changed_when: false
check_mode: no
when: is_etcd_master and etcd_events_cluster_setup
tags:
- facts
environment:
ETCDCTL_CERT_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
ETCDCTL_KEY_FILE: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"

3
roles/kubernetes/master/templates/kubeadm-config.yaml.j2

@ -38,6 +38,9 @@ apiServerExtraArgs:
apiserver-count: "{{ kube_apiserver_count }}"
{% if kube_version | version_compare('v1.9', '>=') %}
endpoint-reconciler-type: lease
{% endif %}
{% if etcd_events_cluster_setup %}
etcd-servers-overrides: "/events#{{ etcd_events_access_addresses }}"
{% endif %}
service-node-port-range: {{ kube_apiserver_node_port_range }}
kubelet-preferred-address-types: "{{ kubelet_preferred_address_types }}"

Loading…
Cancel
Save