Fix notification timing issue by sending in the latter of 2 events (#12110)

* Track host_status_counts and use that to process notifications

* Remove now unused setting

* Back out changes to callback class not needed after all

* Skirt the need for duck typing by leaning on the cached field

* Delete tests for deleted task

* Revert "Back out changes to callback class not needed after all"

This reverts commit 3b8ae350d218991d42bffd65ce4baac6f41926b2.

* Directly hardcode stats_event_type for callback class

* Fire notifications if stats event was never sent

* Remove test content for deleted methods

* Add placeholder for when no hosts matched

* Make field default be None, denote events processed with empty dict

* Make UI process null value for host_status_counts

* Fix tracking of EOF dispatch for system jobs

* Reorganize EVENT_MAP into class properties

* Consolidate conditional I missed from EVENT_MAP refactor

* Give up on the null condition, also applies for empty hosts

* Remove cls positional argument not being used

* Move wrapup method out of class, add tests
Commit 29d60844a8 (parent 41b0607d7e)
Author: Alan Rominger
Date: 2022-04-29 13:54:31 -04:00
Committed by: GitHub

15 changed files with 139 additions and 181 deletions
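
As context for the title, a rough sketch of the two-path coordination this change sets up: notifications go out from whichever of the two events happens last, either the playbook stats event being flushed or the job reaching a final status. Only host_status_counts, ACTIVE_STATES, and send_notification_templates appear in the diff below; the other names here are illustrative placeholders, not AWX code.

    # Illustrative sketch only. The two "events" in the commit title are:
    #   1) the playbook stats event getting flushed by the callback receiver
    #   2) the job reaching a final (non-active) status
    # Whichever happens second is responsible for sending notifications.

    ACTIVE_STATES = ('pending', 'waiting', 'running')  # placeholder for the real constant

    def on_stats_event_flushed(job, host_status_counts):
        # Path 1: stats have been processed; cache them on the job.
        job.host_status_counts = host_status_counts
        job.save(update_fields=['host_status_counts'])
        if job.status not in ACTIVE_STATES:
            # The job already finished, so this is the later event: notify now.
            notify(job)

    def on_job_reached_final_status(job):
        # Path 2: the job just finished; notify only if stats were already cached
        # (host_status_counts defaults to None until the stats event is processed).
        if job.host_status_counts is not None:
            notify(job)

    def notify(job):
        job.send_notification_templates('succeeded' if job.status == 'successful' else 'failed')

Only the path that runs second satisfies its guard, so the notification waits until both the final job status and the host summary data are available.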


@@ -7,7 +7,7 @@ import traceback
 from django.conf import settings
 from django.utils.timezone import now as tz_now
-from django.db import DatabaseError, OperationalError, connection as django_connection
+from django.db import DatabaseError, OperationalError, transaction, connection as django_connection
 from django.db.utils import InterfaceError, InternalError
 from django_guid import set_guid
@@ -16,8 +16,8 @@ import psutil
 import redis
 from awx.main.consumers import emit_channel_notification
-from awx.main.models import JobEvent, AdHocCommandEvent, ProjectUpdateEvent, InventoryUpdateEvent, SystemJobEvent, UnifiedJob, Job
-from awx.main.tasks.system import handle_success_and_failure_notifications
+from awx.main.models import JobEvent, AdHocCommandEvent, ProjectUpdateEvent, InventoryUpdateEvent, SystemJobEvent, UnifiedJob
+from awx.main.constants import ACTIVE_STATES
 from awx.main.models.events import emit_event_detail
 from awx.main.utils.profiling import AWXProfiler
 import awx.main.analytics.subsystem_metrics as s_metrics
@@ -26,6 +26,32 @@ from .base import BaseWorker
 logger = logging.getLogger('awx.main.commands.run_callback_receiver')

+
+def job_stats_wrapup(job_identifier, event=None):
+    """Fill in the unified job host_status_counts, fire off notifications if needed"""
+    try:
+        # empty dict (versus default of None) can still indicate that events have been processed
+        # for job types like system jobs, and jobs with no hosts matched
+        host_status_counts = {}
+        if event:
+            host_status_counts = event.get_host_status_counts()
+
+        # Update host_status_counts while holding the row lock
+        with transaction.atomic():
+            uj = UnifiedJob.objects.select_for_update().get(pk=job_identifier)
+            uj.host_status_counts = host_status_counts
+            uj.save(update_fields=['host_status_counts'])
+
+        uj.log_lifecycle("stats_wrapup_finished")
+
+        # If the status was a finished state before this update was made, send notifications
+        # If not, we will send notifications when the status changes
+        if uj.status not in ACTIVE_STATES:
+            uj.send_notification_templates('succeeded' if uj.status == 'successful' else 'failed')
+
+    except Exception:
+        logger.exception('Worker failed to save stats or emit notifications: Job {}'.format(job_identifier))
+
+
 class CallbackBrokerWorker(BaseWorker):
     """
     A worker implementation that deserializes callback event data and persists
@@ -146,6 +172,8 @@ class CallbackBrokerWorker(BaseWorker):
                     if not getattr(e, '_skip_websocket_message', False):
                         metrics_events_broadcast += 1
                         emit_event_detail(e)
+                    if getattr(e, '_notification_trigger_event', False):
+                        job_stats_wrapup(getattr(e, e.JOB_REFERENCE), event=e)
             self.buff = {}
             self.last_flush = time.time()
             # only update metrics if we saved events
@@ -165,47 +193,32 @@ class CallbackBrokerWorker(BaseWorker):
             if flush:
                 self.last_event = ''
             if not flush:
-                event_map = {
-                    'job_id': JobEvent,
-                    'ad_hoc_command_id': AdHocCommandEvent,
-                    'project_update_id': ProjectUpdateEvent,
-                    'inventory_update_id': InventoryUpdateEvent,
-                    'system_job_id': SystemJobEvent,
-                }
-
                 job_identifier = 'unknown job'
-                for key, cls in event_map.items():
-                    if key in body:
-                        job_identifier = body[key]
+                for cls in (JobEvent, AdHocCommandEvent, ProjectUpdateEvent, InventoryUpdateEvent, SystemJobEvent):
+                    if cls.JOB_REFERENCE in body:
+                        job_identifier = body[cls.JOB_REFERENCE]
                         break

                 self.last_event = f'\n\t- {cls.__name__} for #{job_identifier} ({body.get("event", "")} {body.get("uuid", "")})'  # noqa

+                notification_trigger_event = bool(body.get('event') == cls.WRAPUP_EVENT)
+
                 if body.get('event') == 'EOF':
                     try:
                         if 'guid' in body:
                             set_guid(body['guid'])
                         final_counter = body.get('final_counter', 0)
-                        logger.info('Event processing is finished for Job {}, sending notifications'.format(job_identifier))
+                        logger.info('Starting EOF event processing for Job {}'.format(job_identifier))
                         # EOF events are sent when stdout for the running task is
                         # closed. don't actually persist them to the database; we
                         # just use them to report `summary` websocket events as an
                         # approximation for when a job is "done"
                         emit_channel_notification('jobs-summary', dict(group_name='jobs', unified_job_id=job_identifier, final_counter=final_counter))
-                        # Additionally, when we've processed all events, we should
-                        # have all the data we need to send out success/failure
-                        # notification templates
-                        uj = UnifiedJob.objects.get(pk=job_identifier)
-                        if isinstance(uj, Job):
-                            # *actual playbooks* send their success/failure
-                            # notifications in response to the playbook_on_stats
-                            # event handling code in main.models.events
-                            pass
-                        elif hasattr(uj, 'send_notification_templates'):
-                            handle_success_and_failure_notifications.apply_async([uj.id])
+
+                        if notification_trigger_event:
+                            job_stats_wrapup(job_identifier)
                     except Exception:
-                        logger.exception('Worker failed to emit notifications: Job {}'.format(job_identifier))
+                        logger.exception('Worker failed to perform EOF tasks: Job {}'.format(job_identifier))
                     finally:
                         self.subsystem_metrics.inc('callback_receiver_events_in_memory', -1)
                         set_guid('')
@@ -215,9 +228,12 @@ class CallbackBrokerWorker(BaseWorker):
             event = cls.create_from_data(**body)

-            if skip_websocket_message:
+            if skip_websocket_message:  # if this event sends websocket messages, fire them off on flush
                 event._skip_websocket_message = True

+            if notification_trigger_event:  # if this is an Ansible stats event, ensure notifications on flush
+                event._notification_trigger_event = True
+
             self.buff.setdefault(cls, []).append(event)

             retries = 0
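
The final bullet in the commit message mentions tests for the extracted job_stats_wrapup helper. A minimal sketch of what such a unit test could look like, assuming the worker module shown above lives at awx.main.dispatch.worker.callback and using mocks in place of a database; the commit's actual tests may differ.

    from unittest import mock

    # assumed import path for the worker module shown in the diff above
    from awx.main.dispatch.worker.callback import job_stats_wrapup


    def test_wrapup_notifies_when_job_already_finished():
        uj = mock.MagicMock(status='successful')
        stats_event = mock.MagicMock()
        stats_event.get_host_status_counts.return_value = {'ok': 1}

        with mock.patch('awx.main.dispatch.worker.callback.transaction'):
            with mock.patch('awx.main.dispatch.worker.callback.UnifiedJob') as uj_cls:
                uj_cls.objects.select_for_update.return_value.get.return_value = uj
                job_stats_wrapup(42, event=stats_event)

        # the stats are cached on the job and, because its status is already final,
        # the success notification fires from this (the later) event
        assert uj.host_status_counts == {'ok': 1}
        uj.send_notification_templates.assert_called_once_with('succeeded')

Patching UnifiedJob and transaction keeps the sketch free of database access while still exercising the status check that decides whether notifications fire.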