Merge pull request #4716 from jladdjr/perf_stats

Enable collection of performance stats
This commit is contained in:
Ryan Petrello
2019-10-04 17:09:30 -04:00
committed by GitHub
10 changed files with 158 additions and 4 deletions

View File

@@ -10,8 +10,8 @@ from django.utils.translation import ugettext_lazy as _
# Django REST Framework
from rest_framework.fields import ( # noqa
BooleanField, CharField, ChoiceField, DictField, EmailField, IntegerField,
ListField, NullBooleanField
BooleanField, CharField, ChoiceField, DictField, EmailField,
IntegerField, ListField, NullBooleanField
)
logger = logging.getLogger('awx.conf.fields')

View File

@@ -8,6 +8,7 @@ from django.utils.translation import ugettext_lazy as _
# Django REST Framework
from rest_framework import serializers
from rest_framework.fields import FloatField
# Tower
from awx.conf import fields, register, register_validate
@@ -345,6 +346,49 @@ register(
category_slug='jobs',
)
# Global switch for collecting resource profiling data on jobs (off by default).
register(
    'AWX_RESOURCE_PROFILING_ENABLED',
    field_class=fields.BooleanField,
    category=_('Jobs'),
    category_slug='jobs',
    label=_('Enable resource profiling on all tower jobs'),
    help_text=_('If set, resource profiling data will be collected on all jobs.'),  # noqa
    default=False,
)

# The three poll intervals below are optional FloatField settings expressed in
# seconds.  Note that each default is given as the string '0.25'.
register(
    'AWX_RESOURCE_PROFILING_CPU_POLL_INTERVAL',
    field_class=FloatField,
    required=False,
    category=_('Jobs'),
    category_slug='jobs',
    label=_('Interval (in seconds) between polls for cpu usage.'),
    help_text=_('Interval (in seconds) between polls for cpu usage.'),
    default='0.25',
)

register(
    'AWX_RESOURCE_PROFILING_MEMORY_POLL_INTERVAL',
    field_class=FloatField,
    required=False,
    category=_('Jobs'),
    category_slug='jobs',
    label=_('Interval (in seconds) between polls for memory usage.'),
    help_text=_('Interval (in seconds) between polls for memory usage.'),
    default='0.25',
)

register(
    'AWX_RESOURCE_PROFILING_PID_POLL_INTERVAL',
    field_class=FloatField,
    required=False,
    category=_('Jobs'),
    category_slug='jobs',
    label=_('Interval (in seconds) between polls for PID count.'),
    help_text=_('Interval (in seconds) between polls for PID count.'),
    default='0.25',
)
register(
'AWX_TASK_ENV',
field_class=fields.KeyValueField,

View File

@@ -908,6 +908,31 @@ class BaseTask(object):
process_isolation_params['process_isolation_ro_paths'].append(instance.ansible_virtualenv_path)
return process_isolation_params
def build_params_resource_profiling(self, instance, private_data_dir):
    """Build the resource profiling parameters for a task run.

    Args:
        instance: the job/task about to run; handed to
            ``should_use_resource_profiling`` to decide whether profiling
            applies to it.
        private_data_dir: the run's private data directory. Profiling
            results are directed to ``artifacts/playbook_profiling``
            beneath it, and that directory is created if it is missing.

    Returns:
        dict: the ``resource_profiling*`` options (enabled flag, base
        cgroup, the three poll intervals read from settings, and the
        results directory), or an empty dict when profiling is not
        enabled for this task.

    Raises:
        OSError: if the results directory cannot be created (for example
            when the path already exists but is not a directory).
    """
    if not self.should_use_resource_profiling(instance):
        logger.debug('Resource profiling not enabled for task')
        return {}

    cpu_poll_interval = settings.AWX_RESOURCE_PROFILING_CPU_POLL_INTERVAL
    mem_poll_interval = settings.AWX_RESOURCE_PROFILING_MEMORY_POLL_INTERVAL
    pid_poll_interval = settings.AWX_RESOURCE_PROFILING_PID_POLL_INTERVAL

    results_dir = os.path.join(private_data_dir, 'artifacts/playbook_profiling')
    # exist_ok=True replaces the former isdir()-then-makedirs() sequence, which
    # could raise FileExistsError if the directory appeared between the check
    # and the creation.  The directory is owner-only (rwx for the user).
    os.makedirs(results_dir,
                mode=stat.S_IREAD | stat.S_IWRITE | stat.S_IEXEC,
                exist_ok=True)

    logger.debug('Collected the following resource profiling intervals: cpu: {} mem: {} pid: {}'
                 .format(cpu_poll_interval, mem_poll_interval, pid_poll_interval))

    return {
        'resource_profiling': True,
        'resource_profiling_base_cgroup': 'ansible-runner',
        'resource_profiling_cpu_poll_interval': cpu_poll_interval,
        'resource_profiling_memory_poll_interval': mem_poll_interval,
        'resource_profiling_pid_poll_interval': pid_poll_interval,
        'resource_profiling_results_dir': results_dir,
    }
def _write_extra_vars_file(self, private_data_dir, vars, safe_dict={}):
env_path = os.path.join(private_data_dir, 'env')
try:
@@ -968,6 +993,12 @@ class BaseTask(object):
env['AWX_PRIVATE_DATA_DIR'] = private_data_dir
return env
def should_use_resource_profiling(self, job):
    """
    Tell the caller whether resource profiling applies to this task.

    The base implementation always answers ``False`` and ignores ``job``;
    task classes that support profiling override this method.
    """
    profiling_supported = False
    return profiling_supported
def should_use_proot(self, instance):
'''
Return whether this task should use proot.
@@ -1052,6 +1083,12 @@ class BaseTask(object):
'''
Hook for any steps to run after job/task is marked as complete.
'''
job_profiling_dir = os.path.join(private_data_dir, 'artifacts/playbook_profiling')
awx_profiling_dir = '/var/log/tower/playbook_profiling/'
if not os.path.exists(awx_profiling_dir):
os.mkdir(awx_profiling_dir)
if os.path.isdir(job_profiling_dir):
shutil.copytree(job_profiling_dir, os.path.join(awx_profiling_dir, str(instance.pk)))
def event_handler(self, event_data):
#
@@ -1205,6 +1242,8 @@ class BaseTask(object):
self.build_extra_vars_file(self.instance, private_data_dir)
args = self.build_args(self.instance, private_data_dir, passwords)
cwd = self.build_cwd(self.instance, private_data_dir)
resource_profiling_params = self.build_params_resource_profiling(self.instance,
private_data_dir)
process_isolation_params = self.build_params_process_isolation(self.instance,
private_data_dir,
cwd)
@@ -1244,6 +1283,7 @@ class BaseTask(object):
'pexpect_timeout': getattr(settings, 'PEXPECT_TIMEOUT', 5),
'suppress_ansible_output': True,
**process_isolation_params,
**resource_profiling_params,
},
}
@@ -1612,6 +1652,12 @@ class RunJob(BaseTask):
d[r'Vault password \({}\):\s*?$'.format(vault_id)] = k
return d
def should_use_resource_profiling(self, job):
    """
    Tell the caller whether resource profiling applies to this task.

    The answer is the current value of the ``AWX_RESOURCE_PROFILING_ENABLED``
    setting; the ``job`` argument itself is not inspected.
    """
    profiling_enabled = settings.AWX_RESOURCE_PROFILING_ENABLED
    return profiling_enabled
def should_use_proot(self, job):
'''
Return whether this task should use proot.

View File

@@ -469,6 +469,36 @@ class TestGenericRun():
assert '/AWX_VENV_PATH' in process_isolation_params['process_isolation_ro_paths']
assert 2 == len(process_isolation_params['process_isolation_ro_paths'])
@mock.patch('os.makedirs')
def test_build_params_resource_profiling(self, os_makedirs):
    """Profiling params carry the cgroup, poll intervals and results dir.

    ``os.makedirs`` is patched so no directory is created for the fake
    private data dir.
    """
    run_job = tasks.RunJob()
    # Force profiling on regardless of the AWX_RESOURCE_PROFILING_ENABLED setting.
    run_job.should_use_resource_profiling = lambda job: True
    run_job.instance = Job(project=Project(), inventory=Inventory())

    params = run_job.build_params_resource_profiling(run_job.instance, '/fake_private_data_dir')

    assert params['resource_profiling'] is True
    expected_values = [
        ('resource_profiling_base_cgroup', 'ansible-runner'),
        ('resource_profiling_cpu_poll_interval', '0.25'),
        ('resource_profiling_memory_poll_interval', '0.25'),
        ('resource_profiling_pid_poll_interval', '0.25'),
        ('resource_profiling_results_dir', '/fake_private_data_dir/artifacts/playbook_profiling'),
    ]
    for key, expected in expected_values:
        assert params[key] == expected
@pytest.mark.parametrize("scenario, profiling_enabled", [
    ('global_setting', True),
    ('default', False),
])
def test_should_use_resource_profiling(self, scenario, profiling_enabled, settings):
    """RunJob enables profiling only when the global setting is switched on.

    The 'default' scenario leaves the settings untouched and expects
    profiling to be off.
    """
    run_job = tasks.RunJob()
    run_job.instance = Job(project=Project(), inventory=Inventory())

    if scenario == 'global_setting':
        settings.AWX_RESOURCE_PROFILING_ENABLED = True

    outcome = run_job.should_use_resource_profiling(run_job.instance)
    assert outcome == profiling_enabled
def test_created_by_extra_vars(self):
job = Job(created_by=User(pk=123, username='angry-spud'))

View File

@@ -640,6 +640,18 @@ AWX_PROOT_SHOW_PATHS = []
# Note: This setting may be overridden by database settings.
AWX_PROOT_BASE_PATH = "/tmp"
# Resource profiling of jobs is disabled by default.
AWX_RESOURCE_PROFILING_ENABLED = False
# Interval (in seconds) between polls for CPU usage; the value is kept as a string.
AWX_RESOURCE_PROFILING_CPU_POLL_INTERVAL = '0.25'
# Interval (in seconds) between polls for memory usage; the value is kept as a string.
AWX_RESOURCE_PROFILING_MEMORY_POLL_INTERVAL = '0.25'
# Interval (in seconds) between polls for PID count; the value is kept as a string.
AWX_RESOURCE_PROFILING_PID_POLL_INTERVAL = '0.25'
# User definable ansible callback plugins
# Note: This setting may be overridden by database settings.
AWX_ANSIBLE_CALLBACK_PLUGINS = ""

View File

@@ -85,6 +85,9 @@ export default ['i18n', function(i18n) {
AWX_ISOLATED_CONNECTION_TIMEOUT: {
type: 'text',
reset: 'AWX_ISOLATED_CONNECTION_TIMEOUT'
},
AWX_RESOURCE_PROFILING_ENABLED: {
type: 'toggleSwitch',
}
},
buttons: {

18
docs/performance_data.md Normal file
View File

@@ -0,0 +1,18 @@
Performance Data
================
AWX has the ability to collect performance data on job runs.
The following data is collected periodically (every 0.25 seconds by default):
* CPU usage
* Memory usage
* PID count
The data is stored under `/var/log/tower/playbook_profiling`. A new folder is created for each job run. The folder's name is set to the job's ID.
Performance data collection is not enabled by default. To enable performance data collection on all jobs, set the `AWX_RESOURCE_PROFILING_ENABLED` setting to true.
The interval (in seconds) between collections of each metric can be set using:
* AWX_RESOURCE_PROFILING_CPU_POLL_INTERVAL
* AWX_RESOURCE_PROFILING_MEMORY_POLL_INTERVAL
* AWX_RESOURCE_PROFILING_PID_POLL_INTERVAL

View File

@@ -1,4 +1,4 @@
ansible-runner==1.3.4
ansible-runner==1.4.1
appdirs==1.4.2
asgi-amqp==1.1.3
azure-keyvault==1.1.0

View File

@@ -1,6 +1,6 @@
adal==1.2.1 # via msrestazure
amqp==2.4.2 # via kombu
ansible-runner==1.3.4
ansible-runner==1.4.1
appdirs==1.4.2
argparse==1.4.0 # via uwsgitop
asgi-amqp==1.1.3

View File

@@ -34,6 +34,7 @@ RUN yum -y install acl \
libselinux-python \
libstdc++.so.6 \
libtool-ltdl-devel \
libcgroup-tools \
libXcomposite \
libXcursor \
libXdamage \