From e94bd128b8f58c856b8bea669fa2bc9bca8e8b88 Mon Sep 17 00:00:00 2001 From: chris meyers Date: Thu, 1 Mar 2018 11:21:54 -0500 Subject: [PATCH 1/2] reap all nodes that haven't checked in * Before this change we would exclude the reaping of new nodes. With this change, new nodes will be considered for reaping just like old nodes. --- awx/main/tasks.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 7bf41c515d..290f8f4acd 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -342,8 +342,6 @@ def cluster_node_heartbeat(self): stop_local_services(['uwsgi', 'celery', 'beat', 'callback'], communicate=False) raise RuntimeError("Shutting down.") for other_inst in lost_instances: - if other_inst.capacity == 0: - continue try: other_inst.capacity = 0 other_inst.save(update_fields=['capacity']) From 17de084d04a2b57a77329e25b7a9f1ec107e09c3 Mon Sep 17 00:00:00 2001 From: chris meyers Date: Thu, 1 Mar 2018 17:17:50 -0500 Subject: [PATCH 2/2] perform the min needed DB ops to offline a node * Don't do an extra save to the DB that could conflict with another heartbeat when it isn't needed since we will be deleting the node anyway. 
--- awx/main/tasks.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 290f8f4acd..026a91ef17 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -343,11 +343,19 @@ def cluster_node_heartbeat(self): raise RuntimeError("Shutting down.") for other_inst in lost_instances: try: - other_inst.capacity = 0 - other_inst.save(update_fields=['capacity']) - logger.error("Host {} last checked in at {}, marked as lost.".format( - other_inst.hostname, other_inst.modified)) - if settings.AWX_AUTO_DEPROVISION_INSTANCES: + # Capacity could already be 0 because: + # * It's a new node and it never had a heartbeat + # * It was set to 0 by another tower node running this method + # * It was set to 0 by this node, but auto deprovisioning is off + # + # If auto deprovisioning is on, don't bother setting the capacity to 0 + # since we will delete the node anyway. + if other_inst.capacity != 0 and not settings.AWX_AUTO_DEPROVISION_INSTANCES: + other_inst.capacity = 0 + other_inst.save(update_fields=['capacity']) + logger.error("Host {} last checked in at {}, marked as lost.".format( + other_inst.hostname, other_inst.modified)) + elif settings.AWX_AUTO_DEPROVISION_INSTANCES: deprovision_hostname = other_inst.hostname other_inst.delete() logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname))