awx/awx/main/tasks/host_metrics.py
Lila Yasin de4e707bb2
[Bug] AAP 42572 database deadlock (#15953)
* Demo of sorting hosts live test

* Sort both bulk updates and add batch size to facts bulk update to resolve deadlock issue

* Update tests to expect batch_size to agree with changes

* Add utility method to bulk update and sort hosts and apply it in the appropriate locations

Remove unused imports

Add utility method for sorting bulk updates

Remove try except OperationalError for loop

Remove unused import of django.db.OperationalError

Remove batch size as it now defaults to 100 on the bulk update utility method

Remove batch size here since it is specified in sortedbulkupdate

Add transaction.atomic so the entire operation runs as a single transaction before committing to the db

Revert change to bulk update as it's not needed here and just sort instead

Move bulk_sorted utility method into db.py and update the name so it is not specific to Hosts

Revise to import bulk_update_sorted.. rather than calling it as an argument

Fix the way I'm importing bulk_update_sorted.. Remove unneeded Host import and remove calls to bulk_update as args

Revise calls to bulk_update_sorted.. to include Host in the args

Remove raw_update_hosts method and replace with bulk_update_sorted_by_id in update_hosts

Remove update_hosts function and replace with bulk_update_sorted_by_id

Update live tests to use bulk_update_sorted_by_id

Fix the fields in bulk_update to agree with test

* Update functional tests to use bulk_update_sorted_by_id since update_hosts has been deleted

Replace update_hosts with bulk_update_sorted_by_id

Remove references to update_hosts

Update corresponding fact caching tests to use bulk_update_sorted_by_id

Remove import of bulk_sorted_update

Add code comment to live test to silence Sonarqube hotspot

* Add comment NOSONAR to get rid of Sonarqube warning since this is just a test and it's not actually a security issue

Get test_finish_job_fact_cache_with_existing_data passing

Get test_finish_job_fact_cache_clear passing

Remove reference to raw_update and replace with new bulk update utility method

Add pytest.mark.django_db to appropriate tests

Correct which model is called in bulk_update_sorted_by_id

Remove now unused Host import

Point the bulk_update_sorted_by_id import to where it is actually being used

Correct import of bulk_update_sorted_by_id

Revert changes in this file to avoid db calls issue

Remove @pytest.mark.django_db from unit tests

Remove commented out host sorting suggested fix

Fix failing tests test_pre_post_run_hook_facts_deleted_sliced & test_pre_post_run_hook_facts

Remove atomic transaction line, add return, and add docstring

* Fix failing test test_finish_job_fact_cache_clear & test_finish_job_fact_cache_with_existing_data
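
For context, the sorting utility described above lives in awx/main/utils/db.py. A minimal sketch of what bulk_update_sorted_by_id might look like, based only on the notes in this commit (sort by id, batched bulk_update with a batch size of 100) - the exact upstream signature may differ:

    def bulk_update_sorted_by_id(model, objects, fields, batch_size=100):
        # Sorting by primary key makes concurrent transactions acquire row locks
        # in a consistent order, which is how the AAP 42572 deadlock is avoided.
        objects_sorted = sorted(objects, key=lambda obj: obj.id)
        # Batching keeps each UPDATE statement small.
        return model.objects.bulk_update(objects_sorted, fields, batch_size=batch_size)

A call site in this file then reads bulk_update_sorted_by_id(HostMetricSummaryMonthly, self.records_to_update, ['license_consumed', 'hosts_added', 'hosts_deleted']); the fact-cache call sites mentioned above pass Host and their own field list.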

---------

Co-authored-by: Alan Rominger <arominge@redhat.com>
2025-05-02 17:35:41 -04:00

280 lines
12 KiB
Python

import datetime
from dateutil.relativedelta import relativedelta
import logging

from django.conf import settings
from django.db.models import Count, F
from django.db.models.functions import TruncMonth
from django.utils.timezone import now

from awx.main.dispatch import get_task_queuename
from awx.main.dispatch.publish import task
from awx.main.models.inventory import HostMetric, HostMetricSummaryMonthly
from awx.main.tasks.helpers import is_run_threshold_reached
from awx.conf.license import get_license

from ansible_base.lib.utils.db import advisory_lock

from awx.main.utils.db import bulk_update_sorted_by_id

logger = logging.getLogger('awx.main.tasks.host_metrics')


@task(queue=get_task_queuename)
def cleanup_host_metrics():
    if is_run_threshold_reached(getattr(settings, 'CLEANUP_HOST_METRICS_LAST_TS', None), getattr(settings, 'CLEANUP_HOST_METRICS_INTERVAL', 30) * 86400):
        logger.info(f"Executing cleanup_host_metrics, last ran at {getattr(settings, 'CLEANUP_HOST_METRICS_LAST_TS', '---')}")
        HostMetricTask().cleanup(
            soft_threshold=getattr(settings, 'CLEANUP_HOST_METRICS_SOFT_THRESHOLD', 12),
            hard_threshold=getattr(settings, 'CLEANUP_HOST_METRICS_HARD_THRESHOLD', 36),
        )
        logger.info("Finished cleanup_host_metrics")


@task(queue=get_task_queuename)
def host_metric_summary_monthly():
    """Run cleanup host metrics summary monthly task each week"""
    if is_run_threshold_reached(getattr(settings, 'HOST_METRIC_SUMMARY_TASK_LAST_TS', None), getattr(settings, 'HOST_METRIC_SUMMARY_TASK_INTERVAL', 7) * 86400):
        logger.info(f"Executing host_metric_summary_monthly, last ran at {getattr(settings, 'HOST_METRIC_SUMMARY_TASK_LAST_TS', '---')}")
        HostMetricSummaryMonthlyTask().execute()
        logger.info("Finished host_metric_summary_monthly")


class HostMetricTask:
    """
    This class provides the cleanup task for the HostMetric model.
    There are two modes:
    - soft cleanup (updates columns deleted, deleted_counter and last_deleted)
    - hard cleanup (deletes from the db)
    """

    def cleanup(self, soft_threshold=None, hard_threshold=None):
        """
        Main entrypoint, runs either soft cleanup, hard cleanup or both

        :param soft_threshold: (int)
        :param hard_threshold: (int)
        """
        if hard_threshold is not None:
            self.hard_cleanup(hard_threshold)
        if soft_threshold is not None:
            self.soft_cleanup(soft_threshold)

        settings.CLEANUP_HOST_METRICS_LAST_TS = now()

    @staticmethod
    def soft_cleanup(threshold=None):
        if threshold is None:
            threshold = getattr(settings, 'CLEANUP_HOST_METRICS_SOFT_THRESHOLD', 12)

        try:
            threshold = int(threshold)
        except (ValueError, TypeError) as e:
            raise type(e)("soft_threshold has to be convertible to number") from e

        last_automation_before = now() - relativedelta(months=threshold)
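        #
        # -- Approximate SQL translation (assuming active_objects filters deleted = false)
        # UPDATE main_hostmetric
        # SET deleted = true,
        #     deleted_counter = deleted_counter + 1,
        #     last_deleted = now()
        # WHERE deleted = false
        #   AND last_automation < <last_automation_before>;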
        rows = HostMetric.active_objects.filter(last_automation__lt=last_automation_before).update(
            deleted=True, deleted_counter=F('deleted_counter') + 1, last_deleted=now()
        )
        logger.info(f'cleanup_host_metrics: soft-deleted records last automated before {last_automation_before}, affected rows: {rows}')

    @staticmethod
    def hard_cleanup(threshold=None):
        if threshold is None:
            threshold = getattr(settings, 'CLEANUP_HOST_METRICS_HARD_THRESHOLD', 36)

        try:
            threshold = int(threshold)
        except (ValueError, TypeError) as e:
            raise type(e)("hard_threshold has to be convertible to number") from e

        last_deleted_before = now() - relativedelta(months=threshold)
        queryset = HostMetric.objects.filter(deleted=True, last_deleted__lt=last_deleted_before)
        rows = queryset.delete()
        logger.info(f'cleanup_host_metrics: hard-deleted records which were soft deleted before {last_deleted_before}, affected rows: {rows[0]}')


class HostMetricSummaryMonthlyTask:
    LOCK_KEY = 'host_metric_summary_monthly'
    LOCK_SESSION_TIMEOUT = 300000  # 5 minutes.
    """
    The task runs every four hours; the longer lock timeout avoids premature termination due to high db load or other latency.

    This task computes the last [threshold] months of the HostMetricSummaryMonthly table,
    where [threshold] is the setting CLEANUP_HOST_METRICS_HARD_THRESHOLD.
    Each record in the table represents the changes in the HostMetric table in one month.
    It always overwrites all months newer than [threshold] and never updates older months.

    Algorithm:
    - hosts_added are HostMetric records with first_automation in the given month
    - hosts_deleted are HostMetric records with deleted=True and last_deleted in the given month
    - - HostMetrics soft-deleted before [threshold] also increase hosts_deleted in their last_deleted month
    - license_consumed is license_consumed(previous month) + hosts_added - hosts_deleted
    - - license_consumed for HostMetricSummaryMonthly.date < [threshold] is also computed from
        all HostMetrics with first_automation < [threshold]
    - license_capacity is set only for the current month and is never updated (value taken from the current subscription)
    """

    def __init__(self):
        self.host_metrics = {}
        self.processed_month = self._get_first_month()
        self.existing_summaries = None
        self.existing_summaries_idx = 0
        self.existing_summaries_cnt = 0
        self.records_to_create = []
        self.records_to_update = []

    def execute(self):
        with advisory_lock(
            HostMetricSummaryMonthlyTask.LOCK_KEY, lock_session_timeout_milliseconds=HostMetricSummaryMonthlyTask.LOCK_SESSION_TIMEOUT, wait=False
        ) as acquired:
            if not acquired:
                logger.info("Another instance of host_metric_summary_monthly is already running. Exiting.")
                return

            self._load_existing_summaries()
            self._load_hosts_added()
            self._load_hosts_deleted()

            # Get first month after last hard delete
            month = self._get_first_month()
            license_consumed = self._get_license_consumed_before(month)

            # Fill record for each month
            while month <= datetime.date.today().replace(day=1):
                summary = self._find_or_create_summary(month)

                # Update summary and update license_consumed by hosts added/removed this month
                self._update_summary(summary, month, license_consumed)
                license_consumed = summary.license_consumed

                month = month + relativedelta(months=1)

            # Create/Update stats
            HostMetricSummaryMonthly.objects.bulk_create(self.records_to_create)
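            # Note: the helper below (added for AAP 42572, #15953) is understood to sort the rows by id
            # and apply the bulk update in batches, so concurrent workers acquire row locks in a
            # consistent order and the database deadlock is avoided.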
            bulk_update_sorted_by_id(HostMetricSummaryMonthly, self.records_to_update, ['license_consumed', 'hosts_added', 'hosts_deleted'])

            # Set timestamp of last run
            settings.HOST_METRIC_SUMMARY_TASK_LAST_TS = now()

    def _get_license_consumed_before(self, month):
        license_consumed = 0
        for metric_month, metric in self.host_metrics.items():
            if metric_month < month:
                hosts_added = metric.get('hosts_added', 0)
                hosts_deleted = metric.get('hosts_deleted', 0)
                license_consumed = license_consumed + hosts_added - hosts_deleted
            else:
                break
        return license_consumed

    def _load_existing_summaries(self):
        """Find all summaries newer than host metrics delete threshold"""
        self.existing_summaries = HostMetricSummaryMonthly.objects.filter(date__gte=self._get_first_month()).order_by('date')
        self.existing_summaries_idx = 0
        self.existing_summaries_cnt = len(self.existing_summaries)

    def _load_hosts_added(self):
        """Aggregates hosts added each month, by the 'first_automation' timestamp"""
        #
        # -- SQL translation (for better code readability)
        # SELECT date_trunc('month', first_automation) as month,
        #        count(first_automation) AS hosts_added
        # FROM main_hostmetric
        # GROUP BY month
        # ORDER by month;
        result = (
            HostMetric.objects.annotate(month=TruncMonth('first_automation'))
            .values('month')
            .annotate(hosts_added=Count('first_automation'))
            .values('month', 'hosts_added')
            .order_by('month')
        )

        for host_metric in list(result):
            month = host_metric['month']
            if month:
                beginning_of_month = datetime.date(month.year, month.month, 1)
                if self.host_metrics.get(beginning_of_month) is None:
                    self.host_metrics[beginning_of_month] = {}
                self.host_metrics[beginning_of_month]['hosts_added'] = host_metric['hosts_added']

    def _load_hosts_deleted(self):
        """
        Aggregates hosts deleted each month, by the 'last_deleted' timestamp.
        A host metric has to be deleted NOW to be counted as deleted in an earlier month
        (intentionally so - statistics can change retrospectively when a previously deleted host is re-automated)
        """
        #
        # -- SQL translation (for better code readability)
        # SELECT date_trunc('month', last_deleted) as month,
        #        count(last_deleted) AS hosts_deleted
        # FROM main_hostmetric
        # WHERE deleted = True
        # GROUP BY 1  # equal to "GROUP BY month"
        # ORDER by month;
        result = (
            HostMetric.objects.annotate(month=TruncMonth('last_deleted'))
            .values('month')
            .annotate(hosts_deleted=Count('last_deleted'))
            .values('month', 'hosts_deleted')
            .filter(deleted=True)
            .order_by('month')
        )

        for host_metric in list(result):
            month = host_metric['month']
            if month:
                beginning_of_month = datetime.date(month.year, month.month, 1)
                if self.host_metrics.get(beginning_of_month) is None:
                    self.host_metrics[beginning_of_month] = {}
                self.host_metrics[beginning_of_month]['hosts_deleted'] = host_metric['hosts_deleted']

    def _find_or_create_summary(self, month):
        summary = self._find_summary(month)
        if not summary:
            summary = HostMetricSummaryMonthly(date=month)
            self.records_to_create.append(summary)
        else:
            self.records_to_update.append(summary)
        return summary

    def _find_summary(self, month: datetime.date):
        """
        Existing summaries are ordered by month ASC.
        This method is called with months in ascending order too => a single traversal is enough.
        """
        summary = None
        while not summary and self.existing_summaries_idx < self.existing_summaries_cnt:
            tmp = self.existing_summaries[self.existing_summaries_idx]
            tmp_date = tmp.date
            # Normalize to a date so the comparison with `month` is well defined
            if isinstance(tmp_date, datetime.datetime):
                tmp_date = tmp_date.date()
            if tmp_date < month:
                self.existing_summaries_idx += 1
            elif tmp_date == month:
                summary = tmp
            elif tmp_date > month:
                break
        return summary

    def _update_summary(self, summary, month, license_consumed):
        """Updates the metric with hosts added and deleted and sets license info for the current month"""
        # Get month counts from host metrics, zero if not found
        hosts_added, hosts_deleted = 0, 0
        if metric := self.host_metrics.get(month, None):
            hosts_added = metric.get('hosts_added', 0)
            hosts_deleted = metric.get('hosts_deleted', 0)

        summary.license_consumed = license_consumed + hosts_added - hosts_deleted
        summary.hosts_added = hosts_added
        summary.hosts_deleted = hosts_deleted

        # Set subscription count for current month
        if month == datetime.date.today().replace(day=1):
            license_info = get_license()
            summary.license_capacity = license_info.get('instance_count', 0)

        return summary

    @staticmethod
    def _get_first_month():
        """Returns the first month after the host metrics hard delete threshold"""
        threshold = getattr(settings, 'CLEANUP_HOST_METRICS_HARD_THRESHOLD', 36)
        return datetime.date.today().replace(day=1) - relativedelta(months=int(threshold) - 1)