From 1513254622e47eb38a2a888312e19d16c60aa585 Mon Sep 17 00:00:00 2001 From: Farshad Asadpour Date: Thu, 27 Mar 2025 16:40:34 +0330 Subject: [PATCH] fix(remove-node): Ensure safety and validation for node removal process (#12085) This commit enhances the node removal playbook's reliability and safety by implementing the following changes: 1. **Node Validation**: Added a validation step using assert to ensure the `node` variable is defined and contains nodes. If the list is empty or undefined, the playbook fails early, preventing accidental operations on the entire cluster. The validation play sets `gather_facts: false`, like the other plays in this playbook, because the assertion only inspects the `node` variable and needs no facts from the control host. 2. **Removed Defaulting for Hosts**: Updated tasks to enforce explicit `node` variable input without defaulting to critical groups (e.g., `etcd:k8s_cluster:calico_rr`). By validating `node` beforehand, tasks now solely rely on user-provided input and safely avoid unintended targeting. 3. **Explicit User Confirmation**: Enhanced the confirmation prompt to clarify the scope of the operation. The admin is now required to explicitly confirm node state deletion, ensuring a deliberate decision before proceeding. These improvements strengthen the reliability and safety of the `remove-node.yml` playbook by eliminating ambiguous behavior, preventing misconfigurations, and ensuring clear interaction during node removal tasks. --- docs/getting_started/getting-started.md | 2 ++ playbooks/remove_node.yml | 17 ++++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/docs/getting_started/getting-started.md b/docs/getting_started/getting-started.md index 77fdf244f..18050dc4b 100644 --- a/docs/getting_started/getting-started.md +++ b/docs/getting_started/getting-started.md @@ -59,6 +59,8 @@ ansible-playbook -i inventory/mycluster/hosts.yml remove-node.yml -b -v \ --extra-vars "node=nodename,nodename2" ``` +> Note: The playbook does not currently support the removal of the first control plane or etcd node. These nodes are essential for maintaining cluster operations and must remain intact. 
+ If a node is completely unreachable by ssh, add `--extra-vars reset_nodes=false` to skip the node reset step. If one node is unavailable, but others you wish to remove are able to connect via SSH, you could set `reset_nodes=false` as a host diff --git a/playbooks/remove_node.yml b/playbooks/remove_node.yml index 212bc0f4e..9fa2c550a 100644 --- a/playbooks/remove_node.yml +++ b/playbooks/remove_node.yml @@ -1,9 +1,20 @@ --- +- name: Validate nodes for removal + hosts: localhost + gather_facts: false + tasks: + - name: Assert that nodes are specified for removal + assert: + that: + - node is defined + - node | length > 0 + msg: "No nodes specified for removal. The `node` variable must be set explicitly." + - name: Common tasks for every playbooks import_playbook: boilerplate.yml - name: Confirm node removal - hosts: "{{ node | default('etcd:k8s_cluster:calico_rr') }}" + hosts: "{{ node | default('this_is_unreachable') }}" gather_facts: false tasks: - name: Confirm Execution @@ -24,7 +35,7 @@ when: reset_nodes | default(True) | bool - name: Reset node - hosts: "{{ node | default('kube_node') }}" + hosts: "{{ node | default('this_is_unreachable') }}" gather_facts: false environment: "{{ proxy_disable_env }}" pre_tasks: @@ -40,7 +51,7 @@ # Currently cannot remove first control plane node or first etcd node - name: Post node removal - hosts: "{{ node | default('kube_control_plane[1:]:etcd[1:]') }}" + hosts: "{{ node | default('this_is_unreachable') }}" gather_facts: false environment: "{{ proxy_disable_env }}" roles: