Mirror of https://github.com/ansible/awx.git, synced 2026-05-17 14:27:42 -02:30
AAP-58539 Move to dispatcherd (#16209)
* WIP First pass
* started removing feature flags and adjusting logic
* Add decorator
* moved to dispatcher decorator
* updated as many as I could find
* Keep callback receiver working
* remove any code that is not used by the callback receiver
* add back auto_max_workers
* added back get_auto_max_workers into common utils
* Remove control and hazmat (squash this not done)
* moved status out and deleted control as no longer needed
* removed unused imports
* adjusted test import to pull correct method
* fixed imports and addressed cluster node heartbeat test
* Update function comments
* Add back hazmat for config and remove baseworker
* added back hazmat per @alancoding feedback around config
* removed baseworker completely and refactored it into the callback worker
* Fix dispatcher run call and remove dispatch setting
* remove dispatcher mock publish setting
* Adjust heartbeat arg and more formatting
* fixed the call to cluster_node_heartbeat missing binder
* Fix attribute error in server logs
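The core of the change is mechanical but pervasive: every system task moves from AWX's in-tree dispatch decorator (kept temporarily under the alias task_awx) to dispatcherd's task decorator, with the same queue/timeout/on_duplicate arguments. A minimal sketch of the pattern; example_task is a hypothetical placeholder, and the decorator arguments mirror the ones used throughout the diff below:

# Hypothetical sketch of the decorator move; example_task is a placeholder.
import logging

from dispatcherd.publish import task

from awx.main.dispatch import get_task_queuename

logger = logging.getLogger(__name__)


# Previously: @task_awx(queue=get_task_queuename, timeout=600)
@task(queue=get_task_queuename, timeout=600)
def example_task(arg):
    # The body is unchanged; only task registration and publishing move to dispatcherd.
    logger.info("processing %s", arg)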
@@ -9,12 +9,12 @@ import shutil
 import time
 from collections import namedtuple
 from contextlib import redirect_stdout
 from datetime import datetime
 from packaging.version import Version
 from io import StringIO

 # dispatcherd
 from dispatcherd.factories import get_control_from_settings
+from dispatcherd.publish import task

 # Runner
 import ansible_runner.cleanup
@@ -56,7 +56,6 @@ from awx.main.analytics.subsystem_metrics import DispatcherMetrics
 from awx.main.constants import ACTIVE_STATES, ERROR_STATES
 from awx.main.consumers import emit_channel_notification
 from awx.main.dispatch import get_task_queuename, reaper
-from awx.main.dispatch.publish import task as task_awx
 from awx.main.models import (
     Instance,
     InstanceGroup,
@@ -74,7 +73,6 @@ from awx.main.tasks.host_indirect import save_indirect_host_entries
 from awx.main.tasks.receptor import administrative_workunit_reaper, get_receptor_ctl, worker_cleanup, worker_info, write_receptor_config
 from awx.main.utils.common import ignore_inventory_computed_fields, ignore_inventory_group_removal
 from awx.main.utils.reload import stop_local_services
-from dispatcherd.publish import task

 logger = logging.getLogger('awx.main.tasks.system')
@@ -95,7 +93,10 @@ def _run_dispatch_startup_common():

-    # TODO: Enable this on VM installs
-    if settings.IS_K8S:
-        write_receptor_config()
+    try:
+        write_receptor_config()
+    except Exception:
+        logger.exception("Failed to write receptor config, skipping.")

     try:
         convert_jsonfields()
@@ -125,20 +126,12 @@ def _run_dispatch_startup_common():
     # no-op.
     #
     apply_cluster_membership_policies()
-    cluster_node_heartbeat()
+    cluster_node_heartbeat(None)
     reaper.startup_reaping()
     m = DispatcherMetrics()
     m.reset_values()


-def _legacy_dispatch_startup():
-    """
-    Legacy branch for startup: simply performs reaping of waiting jobs with a zero grace period.
-    """
-    logger.debug("Legacy dispatcher: calling reaper.reap_waiting with grace_period=0")
-    reaper.reap_waiting(grace_period=0)
-
-
 def _dispatcherd_dispatch_startup():
     """
     New dispatcherd branch for startup: uses the control API to re-submit waiting jobs.
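The docstring above describes re-submitting waiting jobs through dispatcherd's control API rather than reaping them. A rough sketch of that idea, assuming the control object returned by get_control_from_settings() (imported at the top of this file) supports a control_with_reply round-trip; the 'running' command name and reply handling are assumptions for illustration, not confirmed dispatcherd API:

# Illustrative only: the 'running' command and reply shape are assumptions.
ctl = get_control_from_settings()
replies = ctl.control_with_reply('running')
logger.debug("dispatcherd workers report running tasks: %s", replies)
# Jobs stuck in 'waiting' that no worker reports as running would be re-submitted here.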
@@ -153,21 +146,16 @@ def dispatch_startup():
     """
     System initialization at startup.
     First, execute the common logic.
-    Then, if FEATURE_DISPATCHERD_ENABLED is enabled, re-submit waiting jobs via the control API;
-    otherwise, fall back to legacy reaping of waiting jobs.
+    Then, re-submit waiting jobs via the control API.
     """
     _run_dispatch_startup_common()
-    if flag_enabled('FEATURE_DISPATCHERD_ENABLED'):
-        _dispatcherd_dispatch_startup()
-    else:
-        _legacy_dispatch_startup()
+    _dispatcherd_dispatch_startup()


 def inform_cluster_of_shutdown():
     """
     Clean system shutdown that marks the current instance offline.
-    In legacy mode, it also reaps waiting jobs.
-    In dispatcherd mode, it relies on dispatcherd's built-in cleanup.
+    Relies on dispatcherd's built-in cleanup.
     """
     try:
         inst = Instance.objects.get(hostname=settings.CLUSTER_HOST_ID)
@@ -176,18 +164,11 @@ def inform_cluster_of_shutdown():
         logger.exception("Cluster host not found: %s", settings.CLUSTER_HOST_ID)
         return

-    if flag_enabled('FEATURE_DISPATCHERD_ENABLED'):
-        logger.debug("Dispatcherd mode: no extra reaping required for instance %s", inst.hostname)
-    else:
-        try:
-            logger.debug("Legacy mode: reaping waiting jobs for instance %s", inst.hostname)
-            reaper.reap_waiting(inst, grace_period=0)
-        except Exception:
-            logger.exception("Failed to reap waiting jobs for %s", inst.hostname)
+    logger.debug("No extra reaping required for instance %s", inst.hostname)
     logger.warning("Normal shutdown processed for instance %s; instance removed from capacity pool.", inst.hostname)


-@task_awx(queue=get_task_queuename, timeout=3600 * 5)
+@task(queue=get_task_queuename, timeout=3600 * 5)
 def migrate_jsonfield(table, pkfield, columns):
     batchsize = 10000
     with advisory_lock(f'json_migration_{table}', wait=False) as acquired:
@@ -233,7 +214,7 @@ def migrate_jsonfield(table, pkfield, columns):
     logger.warning(f"Migration of {table} to jsonb is finished.")


-@task_awx(queue=get_task_queuename, timeout=3600, on_duplicate='queue_one')
+@task(queue=get_task_queuename, timeout=3600, on_duplicate='queue_one')
 def apply_cluster_membership_policies():
     from awx.main.signals import disable_activity_stream

@@ -345,7 +326,7 @@ def apply_cluster_membership_policies():
     logger.debug('Cluster policy computation finished in {} seconds'.format(time.time() - started_compute))


-@task_awx(queue='tower_settings_change', timeout=600)
+@task(queue='tower_settings_change', timeout=600)
 def clear_setting_cache(setting_keys):
     # log that cache is being cleared
     logger.info(f"clear_setting_cache of keys {setting_keys}")
@@ -363,7 +344,7 @@ def clear_setting_cache(setting_keys):
     ctl.control('set_log_level', data={'level': settings.LOG_AGGREGATOR_LEVEL})


-@task_awx(queue='tower_broadcast_all', timeout=600)
+@task(queue='tower_broadcast_all', timeout=600)
 def delete_project_files(project_path):
     # TODO: possibly implement some retry logic
     lock_file = project_path + '.lock'
@@ -381,7 +362,7 @@ def delete_project_files(project_path):
         logger.exception('Could not remove lock file {}'.format(lock_file))


-@task_awx(queue='tower_broadcast_all')
+@task(queue='tower_broadcast_all')
 def profile_sql(threshold=1, minutes=1):
     if threshold <= 0:
         cache.delete('awx-profile-sql-threshold')
@@ -391,7 +372,7 @@ def profile_sql(threshold=1, minutes=1):
     logger.error('SQL QUERIES >={}s ENABLED FOR {} MINUTE(S)'.format(threshold, minutes))


-@task_awx(queue=get_task_queuename, timeout=1800)
+@task(queue=get_task_queuename, timeout=1800)
 def send_notifications(notification_list, job_id=None):
     if not isinstance(notification_list, list):
         raise TypeError("notification_list should be of type list")
@@ -436,13 +417,13 @@ def events_processed_hook(unified_job):
     save_indirect_host_entries.delay(unified_job.id)


-@task_awx(queue=get_task_queuename, timeout=3600 * 5, on_duplicate='discard')
+@task(queue=get_task_queuename, timeout=3600 * 5, on_duplicate='discard')
 def gather_analytics():
     if is_run_threshold_reached(getattr(settings, 'AUTOMATION_ANALYTICS_LAST_GATHER', None), settings.AUTOMATION_ANALYTICS_GATHER_INTERVAL):
         analytics.gather()


-@task_awx(queue=get_task_queuename, timeout=600, on_duplicate='queue_one')
+@task(queue=get_task_queuename, timeout=600, on_duplicate='queue_one')
 def purge_old_stdout_files():
     nowtime = time.time()
     for f in os.listdir(settings.JOBOUTPUT_ROOT):
@@ -504,18 +485,18 @@ class CleanupImagesAndFiles:
         cls.run_remote(this_inst, **kwargs)


-@task_awx(queue='tower_broadcast_all', timeout=3600)
+@task(queue='tower_broadcast_all', timeout=3600)
 def handle_removed_image(remove_images=None):
     """Special broadcast invocation of this method to handle case of deleted EE"""
     CleanupImagesAndFiles.run(remove_images=remove_images, file_pattern='')


-@task_awx(queue=get_task_queuename, timeout=3600, on_duplicate='queue_one')
+@task(queue=get_task_queuename, timeout=3600, on_duplicate='queue_one')
 def cleanup_images_and_files():
     CleanupImagesAndFiles.run(image_prune=True)


-@task_awx(queue=get_task_queuename, timeout=600, on_duplicate='queue_one')
+@task(queue=get_task_queuename, timeout=600, on_duplicate='queue_one')
 def execution_node_health_check(node):
     if node == '':
         logger.warning('Remote health check incorrectly called with blank string')
@@ -640,44 +621,13 @@ def inspect_execution_and_hop_nodes(instance_list):
             execution_node_health_check.apply_async([hostname])


-@task_awx(queue=get_task_queuename, bind_kwargs=['dispatch_time', 'worker_tasks'])
-def cluster_node_heartbeat(dispatch_time=None, worker_tasks=None):
-    """
-    Original implementation for AWX dispatcher.
-    Uses worker_tasks from bind_kwargs to track running tasks.
-    """
-    # Run common instance management logic
-    this_inst, instance_list, lost_instances = _heartbeat_instance_management()
-    if this_inst is None:
-        return  # Early return case from instance management
-
-    # Check versions
-    _heartbeat_check_versions(this_inst, instance_list)
-
-    # Handle lost instances
-    _heartbeat_handle_lost_instances(lost_instances, this_inst)
-
-    # Run local reaper - original implementation using worker_tasks
-    if worker_tasks is not None:
-        active_task_ids = []
-        for task_list in worker_tasks.values():
-            active_task_ids.extend(task_list)
-
-        # Convert dispatch_time to datetime
-        ref_time = datetime.fromisoformat(dispatch_time) if dispatch_time else now()
-
-        reaper.reap(instance=this_inst, excluded_uuids=active_task_ids, ref_time=ref_time)
-
-        if max(len(task_list) for task_list in worker_tasks.values()) <= 1:
-            reaper.reap_waiting(instance=this_inst, excluded_uuids=active_task_ids, ref_time=ref_time)
-
-
 @task(queue=get_task_queuename, bind=True)
-def adispatch_cluster_node_heartbeat(binder):
+def cluster_node_heartbeat(binder):
     """
     Dispatcherd implementation.
     Uses Control API to get running tasks.
     """

     # Run common instance management logic
     this_inst, instance_list, lost_instances = _heartbeat_instance_management()
     if this_inst is None:
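The surviving heartbeat is registered with bind=True, so dispatcherd passes a binder object as the first argument, giving the running task a handle back into the dispatcher service in place of the old bind_kwargs worker_tasks bookkeeping. A sketch of how a helper like _get_active_task_ids_from_dispatcherd (referenced in the next hunk) might use that handle; this is a hypothetical stand-in, and the control call and reply structure are assumptions, not confirmed dispatcherd API:

def _get_active_task_ids_sketch(binder):
    # Sketch only: the 'running' control command and reply shape are assumed.
    try:
        running = binder.control_with_reply('running')
    except Exception:
        logger.exception("Failed to query dispatcherd for running tasks")
        return None  # the caller logs a warning and skips the reaper
    # Flatten per-worker task lists into one list of task uuids.
    return [task_id for worker_tasks in running for task_id in worker_tasks]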
@@ -690,6 +640,9 @@ def adispatch_cluster_node_heartbeat(binder):
     _heartbeat_handle_lost_instances(lost_instances, this_inst)

     # Get running tasks using dispatcherd API
+    if binder is None:
+        logger.debug("Heartbeat finished in startup.")
+        return
     active_task_ids = _get_active_task_ids_from_dispatcherd(binder)
     if active_task_ids is None:
         logger.warning("No active task IDs retrieved from dispatcherd, skipping reaper")
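The new guard pairs with the startup change earlier in the diff: _run_dispatch_startup_common() invokes cluster_node_heartbeat(None) inline, outside any dispatcherd worker, so there is no binder and the reaper step is skipped; when dispatcherd schedules the task itself, bind=True makes it supply the binder. The two call shapes:

# Inline at startup: no worker context, so binder is None and reaping is skipped.
cluster_node_heartbeat(None)

# Published through dispatcherd, as with the other .delay()/apply_async() calls
# in this file: dispatcherd injects the binder argument itself.
cluster_node_heartbeat.delay()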
@@ -839,7 +792,7 @@ def _heartbeat_handle_lost_instances(lost_instances, this_inst):
             logger.exception('No SQL state available. Error marking {} as lost'.format(other_inst.hostname))


-@task_awx(queue=get_task_queuename, timeout=1800, on_duplicate='queue_one')
+@task(queue=get_task_queuename, timeout=1800, on_duplicate='queue_one')
 def awx_receptor_workunit_reaper():
     """
     When an AWX job is launched via receptor, files such as status, stdin, and stdout are created
@@ -885,7 +838,7 @@ def awx_receptor_workunit_reaper():
     administrative_workunit_reaper(receptor_work_list)


-@task_awx(queue=get_task_queuename, timeout=1800, on_duplicate='queue_one')
+@task(queue=get_task_queuename, timeout=1800, on_duplicate='queue_one')
 def awx_k8s_reaper():
     if not settings.RECEPTOR_RELEASE_WORK:
         return
@@ -908,7 +861,7 @@ def awx_k8s_reaper():
                 logger.exception("Failed to delete orphaned pod {} from {}".format(job.log_format, group))


-@task_awx(queue=get_task_queuename, timeout=3600 * 5, on_duplicate='discard')
+@task(queue=get_task_queuename, timeout=3600 * 5, on_duplicate='discard')
 def awx_periodic_scheduler():
     lock_session_timeout_milliseconds = settings.TASK_MANAGER_LOCK_TIMEOUT * 1000
     with advisory_lock('awx_periodic_scheduler_lock', lock_session_timeout_milliseconds=lock_session_timeout_milliseconds, wait=False) as acquired:
@@ -965,7 +918,7 @@ def awx_periodic_scheduler():
             emit_channel_notification('schedules-changed', dict(id=schedule.id, group_name="schedules"))


-@task_awx(queue=get_task_queuename, timeout=3600)
+@task(queue=get_task_queuename, timeout=3600)
 def handle_failure_notifications(task_ids):
     """A task-ified version of the method that sends notifications."""
     found_task_ids = set()
@@ -980,7 +933,7 @@ def handle_failure_notifications(task_ids):
         logger.warning(f'Could not send notifications for {deleted_tasks} because they were not found in the database')


-@task_awx(queue=get_task_queuename, timeout=3600 * 5)
+@task(queue=get_task_queuename, timeout=3600 * 5)
 def update_inventory_computed_fields(inventory_id):
     """
     Signal handler and wrapper around inventory.update_computed_fields to
@@ -1030,7 +983,7 @@ def update_smart_memberships_for_inventory(smart_inventory):
         return False


-@task_awx(queue=get_task_queuename, timeout=3600, on_duplicate='queue_one')
+@task(queue=get_task_queuename, timeout=3600, on_duplicate='queue_one')
 def update_host_smart_inventory_memberships():
     smart_inventories = Inventory.objects.filter(kind='smart', host_filter__isnull=False, pending_deletion=False)
     changed_inventories = set([])
@@ -1046,7 +999,7 @@ def update_host_smart_inventory_memberships():
         smart_inventory.update_computed_fields()


-@task_awx(queue=get_task_queuename, timeout=3600 * 5)
+@task(queue=get_task_queuename, timeout=3600 * 5)
 def delete_inventory(inventory_id, user_id, retries=5):
     # Delete inventory as user
     if user_id is None:
@@ -1108,7 +1061,7 @@ def _reconstruct_relationships(copy_mapping):
         new_obj.save()


-@task_awx(queue=get_task_queuename, timeout=600)
+@task(queue=get_task_queuename, timeout=600)
 def deep_copy_model_obj(model_module, model_name, obj_pk, new_obj_pk, user_pk, permission_check_func=None):
     logger.debug('Deep copy {} from {} to {}.'.format(model_name, obj_pk, new_obj_pk))

@@ -1163,7 +1116,7 @@ def deep_copy_model_obj(model_module, model_name, obj_pk, new_obj_pk, user_pk, p
         update_inventory_computed_fields.delay(new_obj.id)


-@task_awx(queue=get_task_queuename, timeout=3600, on_duplicate='discard')
+@task(queue=get_task_queuename, timeout=3600, on_duplicate='discard')
 def periodic_resource_sync():
     if not getattr(settings, 'RESOURCE_SERVER', None):
         logger.debug("Skipping periodic resource_sync, RESOURCE_SERVER not configured")