mirror of
https://github.com/kubernetes-sigs/kubespray.git
synced 2026-01-29 23:34:45 -03:30
* ignore_unreachable for etcd dir cleanup: ignore_errors only ignores errors that occur within the "file" module. However, when the target node is offline, the playbook will still fail at this task with the node in the "unreachable" state. Setting "ignore_unreachable: true" allows the playbook to bypass offline nodes and proceed with the recovery tasks on the remaining online nodes. * Re-arrange control plane recovery runbook steps * Remove suggestion to manually update IP addresses: the suggestion was added in 48a182844c9c3438e36c78cbc4518c962e0a9ab2 4 years ago, but a task added 2 years ago, in ee0f1e9d58ed8bf1fd13ff1eb1527678fe4fa6da, automatically updates the API server args with the updated etcd node IP addresses, so this suggestion is no longer needed.
95 lines
3.0 KiB
YAML
95 lines
3.0 KiB
YAML
---
|
|
# Run `etcdctl endpoint health` against the configured etcd endpoints and
# keep the result in `etcd_endpoint_health` for the tasks that follow.
# Only runs when the `broken_etcd` group is non-empty.
- name: Get etcd endpoint health
  when:
    - groups['broken_etcd']
  environment:
    ETCDCTL_API: "3"
    ETCDCTL_ENDPOINTS: "{{ etcd_access_addresses }}"
    ETCDCTL_CACERT: "{{ etcd_cert_dir }}/ca.pem"
    ETCDCTL_CERT: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
  command: "{{ bin_dir }}/etcdctl endpoint health"
  register: etcd_endpoint_health
  # The probe is read-only: never report a change, and run it even in check mode.
  changed_when: false
  check_mode: false
  # A failing probe must not abort the play; later tasks read its stdout/stderr.
  ignore_errors: true  # noqa ignore-errors
|
|
|
|
# Derive `healthy` from the endpoint health probe: true unless etcdctl
# reported "Error: unhealthy cluster" on stderr.
#
# The previous expression was inverted (it was true only when the error
# WAS reported) and used `match`, which is anchored at the start of the
# string, so an error printed after per-endpoint lines was never detected.
# Later tasks gate member removal on `not healthy`, so the fact must be
# false when the cluster is unhealthy.
- name: Set healthy fact
  set_fact:
    # `default('')` keeps the expression valid if the probe result carries no stderr.
    healthy: "{{ (etcd_endpoint_health.stderr | default('')) is not search('Error: unhealthy cluster') }}"
  when:
    - groups['broken_etcd']
|
|
|
|
# Compare the number of "is healthy" lines on the probe's stdout with the
# number of "is unhealthy" lines on its stderr; `has_quorum` is true when
# the healthy count is at least the unhealthy count.
- name: Set has_quorum fact
  when:
    - groups['broken_etcd']
  set_fact:
    has_quorum: >-
      {{ (etcd_endpoint_health.stdout_lines | select('match', '.*is healthy.*') | list | length)
      >= (etcd_endpoint_health.stderr_lines | select('match', '.*is unhealthy.*') | list | length) }}
|
|
|
|
# Hand over to the lost-quorum recovery task file when there are broken
# etcd nodes and `has_quorum` is false.
- name: Recover lost etcd quorum
  when:
    - groups['broken_etcd']
    - not has_quorum
  include_tasks: recover_lost_quorum.yml
|
|
|
|
# Delete the etcd data directory on every host in `broken_etcd`, one
# delegated run per host, while `has_quorum` is true.
- name: Remove etcd data dir
  when:
    - groups['broken_etcd']
    - has_quorum
  with_items: "{{ groups['broken_etcd'] }}"
  delegate_to: "{{ item }}"
  file:
    state: absent
    path: "{{ etcd_data_dir }}"
  # Best effort: neither a module failure nor an unreachable (offline)
  # node may stop the play.
  ignore_unreachable: true
  ignore_errors: true  # noqa ignore-errors
|
|
|
|
# Remove every file in the etcd certificate directory whose name contains
# the name of a broken etcd host. `shell` is required for the glob.
- name: Delete old certificates
  when:
    - groups['broken_etcd']
  with_items: "{{ groups['broken_etcd'] }}"
  shell: "rm {{ etcd_cert_dir }}/*{{ item }}*"
  # Failures are tolerated here; the registered per-item results are read
  # by the next task through this variable name.
  ignore_errors: true
  register: delete_old_cerificates
|
|
|
|
# Walk the per-host results of the certificate deletion and abort when an
# `rm` exited non-zero for any reason other than the files being absent.
- name: Fail if unable to delete old certificates
  loop: "{{ delete_old_cerificates.results }}"
  when:
    - groups['broken_etcd']
    - item.rc != 0
    - "'No such file or directory' not in item.stderr"
  fail:
    msg: "Unable to delete old certificates for: {{ item.item }}"
  changed_when: false
|
|
|
|
# List the current etcd members into `member_list`. Only runs when there
# are broken etcd nodes, `healthy` is false and `has_quorum` is true.
- name: Get etcd cluster members
  when:
    - groups['broken_etcd']
    - not healthy
    - has_quorum
  environment:
    ETCDCTL_API: "3"
    ETCDCTL_ENDPOINTS: "{{ etcd_access_addresses }}"
    ETCDCTL_CACERT: "{{ etcd_cert_dir }}/ca.pem"
    ETCDCTL_CERT: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
  command: "{{ bin_dir }}/etcdctl member list"
  register: member_list
  # Read-only query: never report a change, and run it even in check mode.
  changed_when: false
  check_mode: false
|
|
|
|
# Pair every broken etcd host (item[0]) with every line of the member
# listing (item[1]). Each line is stripped of spaces and split on commas:
# when its third field equals the host's `etcd_member_name`, the first
# field (presumably the member ID - confirm against etcdctl output) is
# passed to `etcdctl member remove`.
- name: Remove broken cluster members
  with_nested:
    - "{{ groups['broken_etcd'] }}"
    - "{{ member_list.stdout_lines }}"
  when:
    - groups['broken_etcd']
    - not healthy
    - has_quorum
    - hostvars[item[0]]['etcd_member_name'] == item[1].replace(' ', '').split(',')[2]
  environment:
    ETCDCTL_API: "3"
    ETCDCTL_ENDPOINTS: "{{ etcd_access_addresses }}"
    ETCDCTL_CACERT: "{{ etcd_cert_dir }}/ca.pem"
    ETCDCTL_CERT: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}.pem"
    ETCDCTL_KEY: "{{ etcd_cert_dir }}/admin-{{ inventory_hostname }}-key.pem"
  command: "{{ bin_dir }}/etcdctl member remove {{ item[1].replace(' ', '').split(',')[0] }}"
|