From f597205fa7afd5f98b00bd416707aea7069a0d8a Mon Sep 17 00:00:00 2001
From: Alan Rominger <arominge@redhat.com>
Date: Wed, 21 Jul 2021 09:53:22 -0400
Subject: [PATCH] Run capacity checks with container isolation (#10688)

This requires swapping out the container images
  for the execution nodes from awx-ee to the awx image

For completeness, the hop node image is switched to the raw
  receptor image

A few outright bugs are fixed here
  memory calculation just was not right at all
  the execution_capacity calculation was reverse of intention

Drop in a few TODOs about error handling from debugging
---
 awx/main/models/ha.py     |  2 +-
 awx/main/tasks.py         | 32 +++++++++++++++++++++-----------
 tools/docker-receptor.yml | 20 +++++++++++++-------
 3 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py
index 3ab2439b95..5cb1ec2e09 100644
--- a/awx/main/models/ha.py
+++ b/awx/main/models/ha.py
@@ -209,7 +209,7 @@ class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin):
     @property
     def execution_capacity(self):
         # TODO: update query to exclude based on node_type field
-        return sum([inst.capacity for inst in self.instances.exclude(version__startswith='ansible-runner-')])
+        return sum([inst.capacity for inst in self.instances.filter(version__startswith='ansible-runner-')])
 
     @property
     def jobs_running(self):
diff --git a/awx/main/tasks.py b/awx/main/tasks.py
index 1098fdbfac..07bc6af986 100644
--- a/awx/main/tasks.py
+++ b/awx/main/tasks.py
@@ -59,7 +59,6 @@ from receptorctl.socket_interface import ReceptorControl
 from dateutil.parser import parse as parse_date
 
 # AWX
-from awx import MODE
 from awx import __version__ as awx_application_version
 from awx.main.constants import PRIVILEGE_ESCALATION_METHODS, STANDARD_INVENTORY_UPDATE_ENV, MINIMAL_EVENTS
 from awx.main.access import access_registry
@@ -103,6 +102,7 @@ from awx.main.utils.common import (
     cleanup_new_process,
     create_partition,
     get_cpu_capacity,
+    get_mem_capacity,
     get_system_task_capacity,
 )
 from awx.main.utils.execution_environments import get_default_execution_environment, get_default_pod_spec, CONTAINER_ROOT, to_container_path
@@ -466,6 +466,8 @@ def cluster_node_heartbeat():
         if inst.hostname == settings.CLUSTER_HOST_ID:
             this_inst = inst
             instance_list.remove(inst)
+        elif inst.version.startswith('ansible-runner'):  # TODO: use proper field when introduced
+            continue
         elif inst.is_lost(ref_time=nowtime):
             lost_instances.append(inst)
             instance_list.remove(inst)
@@ -1773,7 +1775,6 @@ class RunJob(BaseTask):
                 ]
             )
 
-        params['process_isolation'] = False if MODE == 'development' else True
         return params
 
     def pre_run_hook(self, job, private_data_dir):
@@ -2847,11 +2848,6 @@ class RunAdHocCommand(BaseTask):
         d[r'Password:\s*?$'] = 'ssh_password'
         return d
 
-    def build_execution_environment_params(self, instance, private_data_dir):
-        params = super(RunAdHocCommand, self).build_execution_environment_params(instance, private_data_dir)
-        params['process_isolation'] = False if MODE == 'development' else True
-        return params
-
 
 @task(queue=get_local_queuename)
 class RunSystemJob(BaseTask):
@@ -3000,7 +2996,8 @@ class AWXReceptorJob:
                 receptor_ctl.simple_command(f"work release {self.unit_id}")
 
     @classmethod
-    def check_heartbeat(cls, node):
+    def check_heartbeat(cls, node):  # TODO: rename most of these "heartbeat" things
+        logger.info(f'Checking capacity of execution node {node}')
         # make a private data dir and env dir
         private_data_dir = tempfile.mkdtemp(prefix='awx_heartbeat_', dir=settings.AWX_ISOLATION_BASE_PATH)
         env_path = os.path.join(private_data_dir, 'env')
@@ -3022,6 +3019,10 @@ class AWXReceptorJob:
         with open(fn, 'w') as f:
             os.chmod(fn, stat.S_IRUSR | stat.S_IXUSR | stat.S_IWUSR)
             f.write('localhost ansible_connection=local')
+        # we have to create the project directory because it is --workdir and crun needs it to exist
+        # https://github.com/ansible/ansible-runner/issues/758
+        project_path = os.path.join(private_data_dir, 'project')
+        os.makedirs(project_path, mode=0o700)
 
         runner_params = {
             'ident': str(uuid4()),
@@ -3033,7 +3034,7 @@ class AWXReceptorJob:
             'settings': {
                 "container_image": get_default_execution_environment().image,
                 "container_options": ['--user=root'],
-                "process_isolation": False if MODE == 'development' else True,
+                "process_isolation": True,
             },
         }
 
@@ -3067,6 +3068,7 @@ class AWXReceptorJob:
                         version = facts.get('ansible_local', {}).get('ansible_runner', {}).get('version', '')  # noqa
                         if version:
                             self.version = f'ansible-runner-{version}'
+                # TODO: save event_data["stdout"] and log when errors happen
 
             def finished_callback(self, runner_obj):
                 pass
@@ -3075,6 +3077,7 @@ class AWXReceptorJob:
                 pass
 
             def status_handler(self, status_data, runner_config):
+                # TODO: log error cases
                 pass
 
             def update_model(self, *args, **kw):
@@ -3085,12 +3088,13 @@ class AWXReceptorJob:
         res = receptor_job.run(work_type='ansible-runner')
         if res.status == 'successful':
             cpu = get_cpu_capacity(task.cpus)
-            mem = get_cpu_capacity(task.mem_mb)
+            mem = get_mem_capacity(task.mem_mb * 1000000)
+            logger.info(f'Calculated memory capacity: {task.mem_mb}, out: {mem}')
             instance = Instance.objects.get(hostname=node)
             instance.cpu = cpu[0]
             instance.cpu_capacity = cpu[1]
             instance.memory = mem[0]
-            instance.memory_capacity = mem[1]
+            instance.mem_capacity = mem[1]
             instance.capacity = get_system_task_capacity(
                 instance.capacity_adjustment,
                 instance.cpu_capacity,
@@ -3098,6 +3102,12 @@ class AWXReceptorJob:
             )
             instance.version = task.version
             instance.save(update_fields=['capacity', 'version', 'cpu', 'memory', 'cpu_capacity', 'mem_capacity'])
+            logger.info(f'Updated capacity of {node} to cpu: {instance.cpu_capacity} mem: {instance.mem_capacity}')
+        else:
+            # TODO: error handling like we do with jobs
+            # receptorctl work results
+            # receptorctl work list
+            logger.info(f'Capacity check not successful for execution node {node}')
 
     def _run_internal(self, receptor_ctl, work_type=None):
         # Create a socketpair. Where the left side will be used for writing our payload
diff --git a/tools/docker-receptor.yml b/tools/docker-receptor.yml
index 02a30cbf66..57e3ca657d 100644
--- a/tools/docker-receptor.yml
+++ b/tools/docker-receptor.yml
@@ -17,7 +17,7 @@ services:
     volumes:
       - "./docker-compose-cluster:/etc/receptor"
   receptor-hop:
-    image: quay.io/ansible/awx-ee:devel
+    image: quay.io/project-receptor/receptor:latest
     user: root
     container_name: tools_receptor_hop
     hostname: receptor-hop
@@ -29,8 +29,8 @@ services:
     volumes:
       - "../../docker-compose-cluster:/etc/receptor"
   receptor-1:
-    image: quay.io/ansible/awx-ee:devel
-    user: root
+    image: quay.io/awx/awx_devel:devel
+    user: "1000"
     container_name: tools_receptor_1
     hostname: receptor-1
     command: 'receptor --config /etc/receptor/awx-1-receptor.conf'
@@ -38,9 +38,11 @@ services:
       - receptor-hop
     volumes:
       - "../../docker-compose-cluster:/etc/receptor"
+      - "/sys/fs/cgroup:/sys/fs/cgroup"
+    privileged: true
   receptor-2:
-    image: quay.io/ansible/awx-ee:devel
-    user: root
+    image: quay.io/awx/awx_devel:devel
+    user: "1000"
     container_name: tools_receptor_2
     hostname: receptor-2
     command: 'receptor --config /etc/receptor/awx-2-receptor.conf'
@@ -48,9 +50,11 @@ services:
       - receptor-hop
     volumes:
       - "../../docker-compose-cluster:/etc/receptor"
+      - "/sys/fs/cgroup:/sys/fs/cgroup"
+    privileged: true
   receptor-3:
-    image: quay.io/ansible/awx-ee:devel
-    user: root
+    image: quay.io/awx/awx_devel:devel
+    user: "1000"
     container_name: tools_receptor_3
     hostname: receptor-3
     command: 'receptor --config /etc/receptor/awx-3-receptor.conf'
@@ -58,3 +62,5 @@ services:
       - receptor-hop
     volumes:
       - "../../docker-compose-cluster:/etc/receptor"
+      - "/sys/fs/cgroup:/sys/fs/cgroup"
+    privileged: true