Model changes for instance last_seen field to replace modified (#10870)

* Model changes for instance last_seen field to replace modified

* Break up refresh_capacity into smaller units

* Rename execution node methods, fix last_seen clustering

* Use update_fields to make it clear save only affects capacity

* Restructing to pass unit tests

* Fix bug where a PATCH did not update capacity value
This commit is contained in:
Alan Rominger
2021-08-17 11:52:57 -04:00
parent 1a9fcdccc2
commit 928c35ede5
11 changed files with 246 additions and 127 deletions

View File

@@ -21,7 +21,7 @@ from awx.main.managers import InstanceManager, InstanceGroupManager
from awx.main.fields import JSONField
from awx.main.models.base import BaseModel, HasEditsMixin, prevent_search
from awx.main.models.unified_jobs import UnifiedJob
from awx.main.utils import get_cpu_capacity, get_mem_capacity, get_system_task_capacity
from awx.main.utils.common import measure_cpu, get_corrected_cpu, get_cpu_effective_capacity, measure_memory, get_corrected_memory, get_mem_effective_capacity
from awx.main.models.mixins import RelatedJobsMixin
__all__ = ('Instance', 'InstanceGroup', 'TowerScheduleState')
@@ -52,6 +52,7 @@ class Instance(HasPolicyEditsMixin, BaseModel):
objects = InstanceManager()
# Fields set in instance registration
uuid = models.CharField(max_length=40)
hostname = models.CharField(max_length=250, unique=True)
ip_address = models.CharField(
@@ -61,16 +62,11 @@ class Instance(HasPolicyEditsMixin, BaseModel):
max_length=50,
unique=True,
)
# Auto-fields, implementation is different from BaseModel
created = models.DateTimeField(auto_now_add=True)
modified = models.DateTimeField(auto_now=True)
# Fields defined in health check or heartbeat
version = models.CharField(max_length=120, blank=True)
capacity = models.PositiveIntegerField(
default=100,
editable=False,
)
capacity_adjustment = models.DecimalField(default=Decimal(1.0), max_digits=3, decimal_places=2, validators=[MinValueValidator(0)])
enabled = models.BooleanField(default=True)
managed_by_policy = models.BooleanField(default=True)
cpu = models.IntegerField(
default=0,
editable=False,
@@ -78,7 +74,22 @@ class Instance(HasPolicyEditsMixin, BaseModel):
memory = models.BigIntegerField(
default=0,
editable=False,
help_text=_('Total system memory of this instance in bytes.'),
)
last_seen = models.DateTimeField(
null=True,
editable=False,
help_text=_('Last time instance ran its heartbeat task for main cluster nodes. Last known connection to receptor mesh for execution nodes.'),
)
# Capacity management
capacity = models.PositiveIntegerField(
default=100,
editable=False,
)
capacity_adjustment = models.DecimalField(default=Decimal(1.0), max_digits=3, decimal_places=2, validators=[MinValueValidator(0)])
enabled = models.BooleanField(default=True)
managed_by_policy = models.BooleanField(default=True)
cpu_capacity = models.IntegerField(
default=0,
editable=False,
@@ -126,39 +137,83 @@ class Instance(HasPolicyEditsMixin, BaseModel):
return random.choice(Instance.objects.filter(enabled=True).filter(node_type__in=['control', 'hybrid']).values_list('hostname', flat=True))
def is_lost(self, ref_time=None):
if self.last_seen is None:
return True
if ref_time is None:
ref_time = now()
grace_period = 120
return self.modified < ref_time - timedelta(seconds=grace_period)
grace_period = settings.CLUSTER_NODE_HEARTBEAT_PERIOD * 2
if self.node_type == 'execution':
grace_period += settings.RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD
return self.last_seen < ref_time - timedelta(seconds=grace_period)
def mark_offline(self, on_good_terms=False):
self.cpu = self.cpu_capacity = self.memory = self.mem_capacity = self.capacity = 0
update_fields = ['capacity', 'cpu', 'memory', 'cpu_capacity', 'mem_capacity']
if on_good_terms:
update_fields.append('modified')
self.save()
def mark_offline(self, update_last_seen=False, perform_save=True):
if self.cpu_capacity == 0 and self.mem_capacity == 0 and self.capacity == 0 and (not update_last_seen):
return
self.cpu_capacity = self.mem_capacity = self.capacity = 0
if update_last_seen:
self.last_seen = now()
def refresh_capacity(self):
cpu = get_cpu_capacity()
mem = get_mem_capacity()
if perform_save:
update_fields = ['capacity', 'cpu_capacity', 'mem_capacity']
if update_last_seen:
update_fields += ['last_seen']
self.save(update_fields=update_fields)
def set_capacity_value(self):
"""Sets capacity according to capacity adjustment rule (no save)"""
if self.enabled:
self.capacity = get_system_task_capacity(self.capacity_adjustment)
lower_cap = min(self.mem_capacity, self.cpu_capacity)
higher_cap = max(self.mem_capacity, self.cpu_capacity)
self.capacity = lower_cap + (higher_cap - lower_cap) * self.capacity_adjustment
else:
self.capacity = 0
def refresh_capacity_fields(self):
"""Update derived capacity fields from cpu and memory (no save)"""
self.cpu_capacity = get_cpu_effective_capacity(self.cpu)
self.mem_capacity = get_mem_effective_capacity(self.memory)
self.set_capacity_value()
def save_health_data(self, version, cpu, memory, last_seen=None, has_error=False):
update_fields = []
if last_seen is not None and self.last_seen != last_seen:
self.last_seen = last_seen
update_fields.append('last_seen')
if self.version != version:
self.version = version
update_fields.append('version')
new_cpu = get_corrected_cpu(cpu)
if new_cpu != self.cpu:
self.cpu = new_cpu
update_fields.append('cpu')
new_memory = get_corrected_memory(memory)
if new_memory != self.memory:
self.memory = new_memory
update_fields.append('memory')
if not has_error:
self.refresh_capacity_fields()
else:
self.mark_offline(perform_save=False)
update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity'])
self.save(update_fields=update_fields)
def local_health_check(self):
"""Only call this method on the instance that this record represents"""
has_error = False
try:
# if redis is down for some reason, that means we can't persist
# playbook event data; we should consider this a zero capacity event
redis.Redis.from_url(settings.BROKER_URL).ping()
except redis.ConnectionError:
self.capacity = 0
has_error = True
self.cpu = cpu[0]
self.memory = mem[0]
self.cpu_capacity = cpu[1]
self.mem_capacity = mem[1]
self.version = awx_application_version
self.save(update_fields=['capacity', 'version', 'modified', 'cpu', 'memory', 'cpu_capacity', 'mem_capacity'])
self.save_health_data(awx_application_version, measure_cpu(), measure_memory(), last_seen=now(), has_error=has_error)
class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin):