diff --git a/awx/conf/fields.py b/awx/conf/fields.py
index ca731de579..c5516b7c2e 100644
--- a/awx/conf/fields.py
+++ b/awx/conf/fields.py
@@ -10,8 +10,8 @@ from django.utils.translation import ugettext_lazy as _
 
 # Django REST Framework
 from rest_framework.fields import (  # noqa
-    BooleanField, CharField, ChoiceField, DictField, EmailField, IntegerField,
-    ListField, NullBooleanField
+    BooleanField, CharField, ChoiceField, DictField, EmailField,
+    IntegerField, ListField, NullBooleanField
 )
 
 logger = logging.getLogger('awx.conf.fields')
diff --git a/awx/main/conf.py b/awx/main/conf.py
index f528821b8e..3c3f53cadc 100644
--- a/awx/main/conf.py
+++ b/awx/main/conf.py
@@ -8,6 +8,7 @@ from django.utils.translation import ugettext_lazy as _
 
 # Django REST Framework
 from rest_framework import serializers
+from rest_framework.fields import FloatField
 
 # Tower
 from awx.conf import fields, register, register_validate
@@ -345,6 +346,49 @@ register(
     category_slug='jobs',
 )
 
+register(
+    'AWX_RESOURCE_PROFILING_ENABLED',
+    field_class=fields.BooleanField,
+    default=False,
+    label=_('Enable resource profiling on all tower jobs'),
+    help_text=_('If set, resource profiling data will be collected on all jobs.'),  # noqa
+    category=_('Jobs'),
+    category_slug='jobs',
+)
+
+register(
+    'AWX_RESOURCE_PROFILING_CPU_POLL_INTERVAL',
+    field_class=FloatField,
+    default='0.25',
+    label=_('Interval (in seconds) between polls for cpu usage.'),
+    help_text=_('Interval (in seconds) between polls for cpu usage.'),
+    category=_('Jobs'),
+    category_slug='jobs',
+    required=False,
+)
+
+register(
+    'AWX_RESOURCE_PROFILING_MEMORY_POLL_INTERVAL',
+    field_class=FloatField,
+    default='0.25',
+    label=_('Interval (in seconds) between polls for memory usage.'),
+    help_text=_('Interval (in seconds) between polls for memory usage.'),
+    category=_('Jobs'),
+    category_slug='jobs',
+    required=False,
+)
+
+register(
+    'AWX_RESOURCE_PROFILING_PID_POLL_INTERVAL',
+    field_class=FloatField,
+    default='0.25',
+    label=_('Interval (in seconds) between polls for PID count.'),
+    help_text=_('Interval (in seconds) between polls for PID count.'),
+    category=_('Jobs'),
+    category_slug='jobs',
+    required=False,
+)
+
 register(
     'AWX_TASK_ENV',
     field_class=fields.KeyValueField,
diff --git a/awx/main/tasks.py b/awx/main/tasks.py
index 03f0c8afe7..ad396a45cf 100644
--- a/awx/main/tasks.py
+++ b/awx/main/tasks.py
@@ -908,6 +908,38 @@ class BaseTask(object):
             process_isolation_params['process_isolation_ro_paths'].append(instance.ansible_virtualenv_path)
         return process_isolation_params
 
+    def build_params_resource_profiling(self, instance, private_data_dir):
+        '''
+        Build the resource profiling parameters for this task.
+
+        Returns an empty dict unless should_use_resource_profiling() is true;
+        otherwise creates artifacts/playbook_profiling under private_data_dir
+        and returns it together with the configured poll intervals.
+        '''
+        resource_profiling_params = {}
+        if self.should_use_resource_profiling(instance):
+            cpu_poll_interval = settings.AWX_RESOURCE_PROFILING_CPU_POLL_INTERVAL
+            mem_poll_interval = settings.AWX_RESOURCE_PROFILING_MEMORY_POLL_INTERVAL
+            pid_poll_interval = settings.AWX_RESOURCE_PROFILING_PID_POLL_INTERVAL
+
+            results_dir = os.path.join(private_data_dir, 'artifacts/playbook_profiling')
+            # exist_ok replaces the isdir() pre-check, which was a check-then-create race
+            os.makedirs(results_dir, stat.S_IREAD | stat.S_IWRITE | stat.S_IEXEC, exist_ok=True)
+
+            logger.debug('Collected the following resource profiling intervals: cpu: {} mem: {} pid: {}'
+                         .format(cpu_poll_interval, mem_poll_interval, pid_poll_interval))
+
+            resource_profiling_params.update({'resource_profiling': True,
+                                              'resource_profiling_base_cgroup': 'ansible-runner',
+                                              'resource_profiling_cpu_poll_interval': cpu_poll_interval,
+                                              'resource_profiling_memory_poll_interval': mem_poll_interval,
+                                              'resource_profiling_pid_poll_interval': pid_poll_interval,
+                                              'resource_profiling_results_dir': results_dir})
+        else:
+            logger.debug('Resource profiling not enabled for task')
+
+        return resource_profiling_params
+
     def _write_extra_vars_file(self, private_data_dir, vars, safe_dict={}):
         env_path = os.path.join(private_data_dir, 'env')
         try:
@@ -968,6 +1000,12 @@ class BaseTask(object):
         env['AWX_PRIVATE_DATA_DIR'] = private_data_dir
         return env
 
+    def should_use_resource_profiling(self, job):
+        '''
+        Return whether this task should use resource profiling
+        '''
+        return False
+
     def should_use_proot(self, instance):
         '''
         Return whether this task should use proot.
@@ -1052,6 +1090,12 @@ class BaseTask(object):
         '''
         Hook for any steps to run after job/task is marked as complete.
         '''
+        job_profiling_dir = os.path.join(private_data_dir, 'artifacts/playbook_profiling')
+        awx_profiling_dir = '/var/log/tower/playbook_profiling/'
+        if os.path.isdir(job_profiling_dir):
+            # Only create the log directory when this job produced profiling data; exist_ok tolerates concurrent jobs
+            os.makedirs(awx_profiling_dir, exist_ok=True)
+            shutil.copytree(job_profiling_dir, os.path.join(awx_profiling_dir, str(instance.pk)))
 
     def event_handler(self, event_data):
         #
@@ -1205,6 +1249,8 @@ class BaseTask(object):
             self.build_extra_vars_file(self.instance, private_data_dir)
             args = self.build_args(self.instance, private_data_dir, passwords)
             cwd = self.build_cwd(self.instance, private_data_dir)
+            resource_profiling_params = self.build_params_resource_profiling(self.instance,
+                                                                             private_data_dir)
             process_isolation_params = self.build_params_process_isolation(self.instance,
                                                                            private_data_dir,
                                                                            cwd)
@@ -1244,6 +1290,7 @@ class BaseTask(object):
                     'pexpect_timeout': getattr(settings, 'PEXPECT_TIMEOUT', 5),
                     'suppress_ansible_output': True,
                     **process_isolation_params,
+                    **resource_profiling_params,
                 },
             }
 
@@ -1612,6 +1659,12 @@ class RunJob(BaseTask):
                 d[r'Vault password \({}\):\s*?$'.format(vault_id)] = k
         return d
 
+    def should_use_resource_profiling(self, job):
+        '''
+        Return whether this task should use resource profiling
+        '''
+        return settings.AWX_RESOURCE_PROFILING_ENABLED
+
     def should_use_proot(self, job):
         '''
         Return whether this task should use proot.
diff --git a/awx/main/tests/unit/test_tasks.py b/awx/main/tests/unit/test_tasks.py
index c242ee6910..4c49af8b5d 100644
--- a/awx/main/tests/unit/test_tasks.py
+++ b/awx/main/tests/unit/test_tasks.py
@@ -469,6 +469,36 @@ class TestGenericRun():
         assert '/AWX_VENV_PATH' in process_isolation_params['process_isolation_ro_paths']
         assert 2 == len(process_isolation_params['process_isolation_ro_paths'])
 
+
+    @mock.patch('os.makedirs')
+    def test_build_params_resource_profiling(self, os_makedirs):
+        profiled_job = Job(project=Project(), inventory=Inventory())
+        run_job = tasks.RunJob()
+        run_job.should_use_resource_profiling = lambda job: True  # force profiling on for this task
+        run_job.instance = profiled_job
+
+        params = run_job.build_params_resource_profiling(run_job.instance, '/fake_private_data_dir')
+        assert params['resource_profiling'] is True
+        assert params['resource_profiling_base_cgroup'] == 'ansible-runner'
+        assert params['resource_profiling_cpu_poll_interval'] == '0.25'
+        assert params['resource_profiling_memory_poll_interval'] == '0.25'
+        assert params['resource_profiling_pid_poll_interval'] == '0.25'
+        assert params['resource_profiling_results_dir'] == '/fake_private_data_dir/artifacts/playbook_profiling'
+
+
+    @pytest.mark.parametrize("scenario, profiling_enabled", [
+        ('global_setting', True),
+        ('default', False)])
+    def test_should_use_resource_profiling(self, scenario, profiling_enabled, settings):
+        candidate_job = Job(project=Project(), inventory=Inventory())
+        run_job = tasks.RunJob()
+        run_job.instance = candidate_job
+
+        if scenario == 'global_setting':  # the 'default' scenario leaves the setting untouched
+            settings.AWX_RESOURCE_PROFILING_ENABLED = True
+
+        assert run_job.should_use_resource_profiling(run_job.instance) == profiling_enabled
+
     def test_created_by_extra_vars(self):
         job = Job(created_by=User(pk=123, username='angry-spud'))
 
diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py
index ceb482a314..fb24b7014f 100644
--- a/awx/settings/defaults.py
+++ b/awx/settings/defaults.py
@@ -640,6 +640,18 @@ AWX_PROOT_SHOW_PATHS = []
 # Note: This setting may be overridden by database settings.
 AWX_PROOT_BASE_PATH = "/tmp"
 
+# Resource profiling of jobs stays off unless explicitly enabled
+AWX_RESOURCE_PROFILING_ENABLED = False
+
+# Seconds between successive polls of CPU usage
+AWX_RESOURCE_PROFILING_CPU_POLL_INTERVAL = '0.25'
+
+# Seconds between successive polls of memory usage
+AWX_RESOURCE_PROFILING_MEMORY_POLL_INTERVAL = '0.25'
+
+# Seconds between successive polls of the PID count
+AWX_RESOURCE_PROFILING_PID_POLL_INTERVAL = '0.25'
+
 # User definable ansible callback plugins
 # Note: This setting may be overridden by database settings.
 AWX_ANSIBLE_CALLBACK_PLUGINS = ""
diff --git a/awx/ui/client/src/configuration/forms/jobs-form/configuration-jobs.form.js b/awx/ui/client/src/configuration/forms/jobs-form/configuration-jobs.form.js
index d84c5bf9a6..4c844a7b7d 100644
--- a/awx/ui/client/src/configuration/forms/jobs-form/configuration-jobs.form.js
+++ b/awx/ui/client/src/configuration/forms/jobs-form/configuration-jobs.form.js
@@ -85,6 +85,9 @@ export default ['i18n', function(i18n) {
             AWX_ISOLATED_CONNECTION_TIMEOUT: {
                 type: 'text',
                 reset: 'AWX_ISOLATED_CONNECTION_TIMEOUT'
+            },
+            AWX_RESOURCE_PROFILING_ENABLED: {
+                type: 'toggleSwitch',
             }
         },
         buttons: {
diff --git a/docs/performance_data.md b/docs/performance_data.md
new file mode 100644
index 0000000000..819de15fcc
--- /dev/null
+++ b/docs/performance_data.md
@@ -0,0 +1,18 @@
+Performance Data
+================
+
+AWX has the ability to collect performance data on job runs.
+
+The following data is collected periodically (with a default interval of every 0.25 seconds):
+* CPU usage
+* Memory usage
+* PID count
+
+The data is stored under `/var/log/tower/playbook_profiling`. A new folder is created for each job run. The folder's name is set to the job's ID.
+
+Performance data collection is not enabled by default. To enable performance data collection on all jobs, set AWX_RESOURCE_PROFILING_ENABLED to true.
+
+The frequency with which data is collected can be set using:
+* AWX_RESOURCE_PROFILING_CPU_POLL_INTERVAL
+* AWX_RESOURCE_PROFILING_MEMORY_POLL_INTERVAL
+* AWX_RESOURCE_PROFILING_PID_POLL_INTERVAL
diff --git a/requirements/requirements.in b/requirements/requirements.in
index 85c615c682..1b8cc117fb 100644
--- a/requirements/requirements.in
+++ b/requirements/requirements.in
@@ -1,4 +1,4 @@
-ansible-runner==1.3.4
+ansible-runner==1.4.1
 appdirs==1.4.2
 asgi-amqp==1.1.3
 azure-keyvault==1.1.0
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index d11a2c17a9..2a9842a510 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -1,6 +1,6 @@
 adal==1.2.1               # via msrestazure
 amqp==2.4.2               # via kombu
-ansible-runner==1.3.4
+ansible-runner==1.4.1
 appdirs==1.4.2
 argparse==1.4.0           # via uwsgitop
 asgi-amqp==1.1.3
diff --git a/tools/docker-compose/Dockerfile b/tools/docker-compose/Dockerfile
index c5c017fbe7..6e81a9cced 100644
--- a/tools/docker-compose/Dockerfile
+++ b/tools/docker-compose/Dockerfile
@@ -34,6 +34,7 @@ RUN yum -y install acl \
     libselinux-python \
     libstdc++.so.6 \
     libtool-ltdl-devel \
+    libcgroup-tools \
     libXcomposite \
     libXcursor \
     libXdamage \