Fixup conversion of memory and cpu settings to support k8s resource request format (#11725)

fix memory and cpu settings to support k8s resource request format

* fix conversion of memory setting to bytes

This setting has not been getting set by default, and needed some fixing
up to be compatible with setting memory the same way we set it in the
operator, as well as with changes from last year which assume that
ansible-runner returns memory in bytes.

This way we can start setting SYSTEM_TASK_ABS_MEM from the operator and
get a more accurate reflection of how much memory is available to the
control pod in k8s.
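
For illustration, a trimmed sketch of the conversion (the real helper,
convert_mem_str_to_bytes in awx.main.utils.common, appears in the diff
below and also handles the T/P/E suffixes and logs a warning on bad
input; the mem_str_to_bytes name here is just for this example):

    # Sketch: convert a k8s-style memory string (e.g. "2Gi", "4G", "2048Mi") to bytes.
    # Unsupported values fall back to 1 byte so the capacity math still returns something.
    def mem_str_to_bytes(mem_str: str) -> int:
        if mem_str.isdigit():  # a bare number is already bytes
            return int(mem_str)
        suffixes = {'Gi': 2**30, 'G': 10**9, 'Mi': 2**20, 'M': 10**6, 'Ki': 2**10, 'K': 10**3}
        for suffix, factor in suffixes.items():
            number = mem_str[: -len(suffix)]
            if mem_str.endswith(suffix) and number.isdigit():
                return int(number) * factor
        return 1  # unsupported format

    print(mem_str_to_bytes('2Gi'))     # 2147483648
    print(mem_str_to_bytes('4G'))      # 4000000000
    print(mem_str_to_bytes('2048Mi'))  # 2147483648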

On platforms where all services share memory, we deduct a penalty from
the available memory. On k8s we don't need to do this because the web,
redis, and task containers each have their own memory allocation.
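
As a rough sketch of the resulting capacity math (the real version is
get_mem_effective_capacity in the diff below; IS_K8S and the 100 MB
per fork SYSTEM_TASK_FORKS_MEM default are the settings shown there):

    # Sketch: forks we can run given total memory in bytes.
    # Outside k8s, 2 GiB is reserved for colocated services (web, redis, etc.);
    # on k8s those containers have their own allocations, so no penalty is taken.
    def mem_effective_capacity(mem_bytes: int, is_k8s: bool, mem_mb_per_fork: int = 100) -> int:
        memory_penalty_bytes = 0 if is_k8s else 2 * 2**30
        mem_mb = (mem_bytes - memory_penalty_bytes) // 2**20
        return max(1, mem_mb // mem_mb_per_fork)

    print(mem_effective_capacity(4 * 2**30, is_k8s=True))   # 40
    print(mem_effective_capacity(4 * 2**30, is_k8s=False))  # 20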

* Support CPU setting expressed in units used by k8s

This setting has not been getting set by default, and needed some fixing
up to be compatible with setting the CPU in the same way we set the CPU
resource requests/limits in the operator.

This way we can start setting SYSTEM_TASK_ABS_CPU from the operator and
get a more accurate reflection of how much CPU is available to the
control pod in k8s.

Because cpu on k8s can be partial cores, migrate cpu field to decimal.

k8s does not allow granularity of less than 100m (equivalent to 0.1 cores), so only
store up to 1 decimal place.
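
For illustration, a condensed sketch of that conversion (the real
helper, convert_cpu_str_to_decimal_cpu, appears in the diff below and
also logs a warning when it falls back):

    # Sketch: convert a k8s-style cpu string ("2", "1.5", "100m") to decimal cores,
    # rounded to one decimal place with a 0.1-core floor; bad values fall back to 1 cpu.
    def cpu_str_to_decimal(cpu_str: str) -> float:
        value, millicores = cpu_str, False
        if cpu_str.endswith('m'):
            value, millicores = cpu_str[:-1], True
        try:
            cpu = float(value)
        except ValueError:
            return 1.0  # unsupported format
        if millicores:
            cpu = cpu / 1000
        return max(0.1, round(cpu, 1))

    print(cpu_str_to_decimal('100m'))   # 0.1
    print(cpu_str_to_decimal('2505m'))  # 2.5
    print(cpu_str_to_decimal('1.55'))   # 1.6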

fix analytics to deal with decimal cpu

We need to use DjangoJSONEncoder when there are Decimal fields in the
data passed to json.dumps.
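
For example (a minimal, standalone illustration; plain json.dumps
raises TypeError on Decimal values, while DjangoJSONEncoder serializes
them as strings):

    import json
    from decimal import Decimal

    from django.core.serializers.json import DjangoJSONEncoder

    data = {'cpu': Decimal('1.5')}
    # json.dumps(data) -> TypeError: Object of type Decimal is not JSON serializable
    print(json.dumps(data, cls=DjangoJSONEncoder))  # {"cpu": "1.5"}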
Elijah DeLee 2022-02-15 14:08:24 -05:00 committed by GitHub
parent 3f08e26881
commit 799968460d
7 changed files with 204 additions and 43 deletions


@@ -211,7 +211,7 @@ def projects_by_scm_type(since, **kwargs):
return counts
@register('instance_info', '1.1', description=_('Cluster topology and capacity'))
@register('instance_info', '1.2', description=_('Cluster topology and capacity'))
def instance_info(since, include_hostnames=False, **kwargs):
info = {}
instances = models.Instance.objects.values_list('hostname').values(


@@ -90,7 +90,7 @@ def package(target, data, timestamp):
if isinstance(item, str):
f.add(item, arcname=f'./{name}')
else:
buf = json.dumps(item).encode('utf-8')
buf = json.dumps(item, cls=DjangoJSONEncoder).encode('utf-8')
info = tarfile.TarInfo(f'./{name}')
info.size = len(buf)
info.mtime = timestamp.timestamp()
@@ -230,7 +230,7 @@ def gather(dest=None, module=None, subset=None, since=None, until=None, collecti
try:
last_entry = max(last_entries.get(key) or last_gather, until - timedelta(weeks=4))
results = (func(since or last_entry, collection_type=collection_type, until=until), func.__awx_analytics_version__)
json.dumps(results) # throwaway check to see if the data is json-serializable
json.dumps(results, cls=DjangoJSONEncoder) # throwaway check to see if the data is json-serializable
data[filename] = results
except Exception:
logger.exception("Could not generate metric {}".format(filename))


@@ -22,6 +22,7 @@ import psutil
from awx.main.models import UnifiedJob
from awx.main.dispatch import reaper
from awx.main.utils.common import convert_mem_str_to_bytes
if 'run_callback_receiver' in sys.argv:
logger = logging.getLogger('awx.main.commands.run_callback_receiver')
@@ -319,7 +320,8 @@ class AutoscalePool(WorkerPool):
if self.max_workers is None:
settings_absmem = getattr(settings, 'SYSTEM_TASK_ABS_MEM', None)
if settings_absmem is not None:
total_memory_gb = int(settings_absmem)
# There are 1073741824 bytes in a gigabyte. Convert bytes to gigabytes by dividing by 2**30
total_memory_gb = convert_mem_str_to_bytes(settings_absmem) // 2**30
else:
total_memory_gb = (psutil.virtual_memory().total >> 30) + 1 # noqa: round up
# 5 workers per GB of total memory


@@ -0,0 +1,19 @@
# Generated by Django 2.2.24 on 2022-02-14 17:37
from decimal import Decimal
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('main', '0157_inventory_labels'),
]
operations = [
migrations.AlterField(
model_name='instance',
name='cpu',
field=models.DecimalField(decimal_places=1, default=Decimal('0'), editable=False, max_digits=4),
),
]


@@ -82,8 +82,10 @@ class Instance(HasPolicyEditsMixin, BaseModel):
modified = models.DateTimeField(auto_now=True)
# Fields defined in health check or heartbeat
version = models.CharField(max_length=120, blank=True)
cpu = models.IntegerField(
default=0,
cpu = models.DecimalField(
default=Decimal(0.0),
max_digits=4,
decimal_places=1,
editable=False,
)
memory = models.BigIntegerField(


@@ -0,0 +1,61 @@
import pytest
from unittest import mock
from awx.main.utils.common import (
convert_mem_str_to_bytes,
get_mem_effective_capacity,
get_corrected_memory,
convert_cpu_str_to_decimal_cpu,
get_cpu_effective_capacity,
get_corrected_cpu,
)
@pytest.mark.parametrize(
"value,converted_value,mem_capacity",
[
('2G', 2000000000, 19),
('4G', 4000000000, 38),
('2Gi', 2147483648, 20),
('2.1G', 1, 1), # expressing memory with non-integers is not supported, and we'll fall back to 1 fork for memory capacity.
('4Gi', 4294967296, 40),
('2M', 2000000, 1),
('3M', 3000000, 1),
('2Mi', 2097152, 1),
('2048Mi', 2147483648, 20),
('4096Mi', 4294967296, 40),
('64G', 64000000000, 610),
('64Garbage', 1, 1),
],
)
def test_SYSTEM_TASK_ABS_MEM_conversion(value, converted_value, mem_capacity):
with mock.patch('django.conf.settings') as mock_settings:
mock_settings.SYSTEM_TASK_ABS_MEM = value
mock_settings.SYSTEM_TASK_FORKS_MEM = 100
mock_settings.IS_K8S = True
assert convert_mem_str_to_bytes(value) == converted_value
assert get_corrected_memory(-1) == converted_value
assert get_mem_effective_capacity(-1) == mem_capacity
@pytest.mark.parametrize(
"value,converted_value,cpu_capacity",
[
('2', 2.0, 8),
('1.5', 1.5, 6),
('100m', 0.1, 1),
('2000m', 2.0, 8),
('4MillionCPUm', 1.0, 4), # Any suffix other than 'm' is not supported, we fall back to 1 CPU
('Random', 1.0, 4), # Any setting value other than integers, floats millicores (e.g 1, 1.0, or 1000m) is not supported, fall back to 1 CPU
('2505m', 2.5, 10),
('1.55', 1.6, 6),
],
)
def test_SYSTEM_TASK_ABS_CPU_conversion(value, converted_value, cpu_capacity):
with mock.patch('django.conf.settings') as mock_settings:
mock_settings.SYSTEM_TASK_ABS_CPU = value
mock_settings.SYSTEM_TASK_FORKS_CPU = 4
assert convert_cpu_str_to_decimal_cpu(value) == converted_value
assert get_corrected_cpu(-1) == converted_value
assert get_cpu_effective_capacity(-1) == cpu_capacity


@@ -692,28 +692,33 @@ def parse_yaml_or_json(vars_str, silent_failure=True):
return vars_dict
def get_cpu_effective_capacity(cpu_count):
from django.conf import settings
def convert_cpu_str_to_decimal_cpu(cpu_str):
"""Convert a string indicating cpu units to decimal.
settings_abscpu = getattr(settings, 'SYSTEM_TASK_ABS_CPU', None)
env_abscpu = os.getenv('SYSTEM_TASK_ABS_CPU', None)
Useful for dealing with cpu setting that may be expressed in units compatible with
kubernetes.
if env_abscpu is not None:
return int(env_abscpu)
elif settings_abscpu is not None:
return int(settings_abscpu)
See https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/#cpu-units
"""
cpu = cpu_str
millicores = False
settings_forkcpu = getattr(settings, 'SYSTEM_TASK_FORKS_CPU', None)
env_forkcpu = os.getenv('SYSTEM_TASK_FORKS_CPU', None)
if cpu_str[-1] == 'm':
cpu = cpu_str[:-1]
millicores = True
if env_forkcpu:
forkcpu = int(env_forkcpu)
elif settings_forkcpu:
forkcpu = int(settings_forkcpu)
else:
forkcpu = 4
try:
cpu = float(cpu)
except ValueError:
cpu = 1.0
millicores = False
logger.warning(f"Could not convert SYSTEM_TASK_ABS_CPU {cpu_str} to a decimal number, falling back to default of 1 cpu")
return cpu_count * forkcpu
if millicores:
cpu = cpu / 1000
# Per kubernetes docs, fractional CPU less than .1 are not allowed
return max(0.1, round(cpu, 1))
def get_corrected_cpu(cpu_count): # formerlly get_cpu_capacity
@@ -725,34 +730,70 @@ def get_corrected_cpu(cpu_count): # formerlly get_cpu_capacity
settings_abscpu = getattr(settings, 'SYSTEM_TASK_ABS_CPU', None)
env_abscpu = os.getenv('SYSTEM_TASK_ABS_CPU', None)
if env_abscpu is not None or settings_abscpu is not None:
return 0
if env_abscpu is not None:
return convert_cpu_str_to_decimal_cpu(env_abscpu)
elif settings_abscpu is not None:
return convert_cpu_str_to_decimal_cpu(settings_abscpu)
return cpu_count # no correction
def get_mem_effective_capacity(mem_mb):
def get_cpu_effective_capacity(cpu_count):
from django.conf import settings
settings_absmem = getattr(settings, 'SYSTEM_TASK_ABS_MEM', None)
env_absmem = os.getenv('SYSTEM_TASK_ABS_MEM', None)
cpu_count = get_corrected_cpu(cpu_count)
if env_absmem is not None:
return int(env_absmem)
elif settings_absmem is not None:
return int(settings_absmem)
settings_forkcpu = getattr(settings, 'SYSTEM_TASK_FORKS_CPU', None)
env_forkcpu = os.getenv('SYSTEM_TASK_FORKS_CPU', None)
settings_forkmem = getattr(settings, 'SYSTEM_TASK_FORKS_MEM', None)
env_forkmem = os.getenv('SYSTEM_TASK_FORKS_MEM', None)
if env_forkmem:
forkmem = int(env_forkmem)
elif settings_forkmem:
forkmem = int(settings_forkmem)
if env_forkcpu:
forkcpu = int(env_forkcpu)
elif settings_forkcpu:
forkcpu = int(settings_forkcpu)
else:
forkmem = 100
forkcpu = 4
return max(1, ((mem_mb // 1024 // 1024) - 2048) // forkmem)
return max(1, int(cpu_count * forkcpu))
def convert_mem_str_to_bytes(mem_str):
"""Convert string with suffix indicating units to memory in bytes (base 2)
Useful for dealing with memory setting that may be expressed in units compatible with
kubernetes.
See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-memory
"""
# If there is no suffix, the memory sourced from the request is in bytes
if mem_str.isdigit():
return int(mem_str)
conversions = {
'Ei': lambda x: x * 2**60,
'E': lambda x: x * 10**18,
'Pi': lambda x: x * 2**50,
'P': lambda x: x * 10**15,
'Ti': lambda x: x * 2**40,
'T': lambda x: x * 10**12,
'Gi': lambda x: x * 2**30,
'G': lambda x: x * 10**9,
'Mi': lambda x: x * 2**20,
'M': lambda x: x * 10**6,
'Ki': lambda x: x * 2**10,
'K': lambda x: x * 10**3,
}
mem = 0
mem_unit = None
for i, char in enumerate(mem_str):
if not char.isdigit():
mem_unit = mem_str[i:]
mem = int(mem_str[:i])
break
if not mem_unit or mem_unit not in conversions.keys():
error = f"Unsupported value for SYSTEM_TASK_ABS_MEM: {mem_str}, memory must be expressed in bytes or with known suffix: {conversions.keys()}. Falling back to 1 byte"
logger.warning(error)
return 1
return max(1, conversions[mem_unit](mem))
def get_corrected_memory(memory):
@@ -761,12 +802,48 @@ def get_corrected_memory(memory):
settings_absmem = getattr(settings, 'SYSTEM_TASK_ABS_MEM', None)
env_absmem = os.getenv('SYSTEM_TASK_ABS_MEM', None)
if env_absmem is not None or settings_absmem is not None:
return 0
# Runner returns memory in bytes
# so we convert memory from settings to bytes as well.
if env_absmem is not None:
return convert_mem_str_to_bytes(env_absmem)
elif settings_absmem is not None:
return convert_mem_str_to_bytes(settings_absmem)
return memory
def get_mem_effective_capacity(mem_bytes):
from django.conf import settings
mem_bytes = get_corrected_memory(mem_bytes)
settings_mem_mb_per_fork = getattr(settings, 'SYSTEM_TASK_FORKS_MEM', None)
env_mem_mb_per_fork = os.getenv('SYSTEM_TASK_FORKS_MEM', None)
if env_mem_mb_per_fork:
mem_mb_per_fork = int(env_mem_mb_per_fork)
elif settings_mem_mb_per_fork:
mem_mb_per_fork = int(settings_mem_mb_per_fork)
else:
mem_mb_per_fork = 100
# Per docs, deduct 2GB of memory from the available memory
# to cover memory consumption of background tasks when redis/web etc are colocated with
# the other control processes
memory_penalty_bytes = 2147483648
if settings.IS_K8S:
# In k8s, this is dealt with differently because
# redis and the web containers have their own memory allocation
memory_penalty_bytes = 0
# convert memory to megabytes because our setting of how much memory we
# should allocate per fork is in megabytes
mem_mb = (mem_bytes - memory_penalty_bytes) // 2**20
max_forks_based_on_memory = mem_mb // mem_mb_per_fork
return max(1, max_forks_based_on_memory)
_inventory_updates = threading.local()
_task_manager = threading.local()