remove order by from partitioned events query

* The order by results in an in-memory sort that COULD blow out the
worker mem buffer and result in sorting having to take place on disk.
* This WILL happen with a default postgres 4MB mem buffer. We saw as
much as 20MB used. Note that AWX defaults the postgres worker mem buffer to
3% of the DB memory on external installs and 1% on same-node installs.
So for a 16GB remote DB this would not be a problem.
* We are going to avoid this problem altogether by NOT doing a sort
when gathering. Instead, we will sort remotely, in analytics.
This commit is contained in:
Chris Meyers
2021-04-26 13:31:48 -04:00
committed by Jim Ladd
parent 4f058245e4
commit c5a1e4c704

View File

@@ -356,37 +356,37 @@ def _copy_table(table, query, path):
return file.file_list() return file.file_list()
def _events_table(since, full_path, until, tbl, project_job_created=False, **kwargs): def _events_table(since, full_path, until, tbl, project_job_created=False, order_by=True, **kwargs):
def query(event_data): def query(event_data):
# TODO: conditional job_created based on if the column exists or not in the table query = f'''COPY (SELECT {tbl}.id,
# {tbl}.job_created, {tbl}.created,
{tbl}.modified,
return f'''COPY (SELECT {tbl}.id, {tbl + '.job_created' if project_job_created else 'NULL'} as job_created,
{tbl}.created, {tbl}.uuid,
{tbl}.modified, {tbl}.parent_uuid,
{tbl + '.job_created' if project_job_created else 'NULL'} as job_created, {tbl}.event,
{tbl}.uuid, task_action,
{tbl}.parent_uuid, (CASE WHEN event = 'playbook_on_stats' THEN event_data END) as playbook_on_stats,
{tbl}.event, {tbl}.failed,
task_action, {tbl}.changed,
(CASE WHEN event = 'playbook_on_stats' THEN event_data END) as playbook_on_stats, {tbl}.playbook,
{tbl}.failed, {tbl}.play,
{tbl}.changed, {tbl}.task,
{tbl}.playbook, {tbl}.role,
{tbl}.play, {tbl}.job_id,
{tbl}.task, {tbl}.host_id,
{tbl}.role, {tbl}.host_name,
{tbl}.job_id, CAST(x.start AS TIMESTAMP WITH TIME ZONE) AS start,
{tbl}.host_id, CAST(x.end AS TIMESTAMP WITH TIME ZONE) AS end,
{tbl}.host_name, x.duration AS duration,
CAST(x.start AS TIMESTAMP WITH TIME ZONE) AS start, x.res->'warnings' AS warnings,
CAST(x.end AS TIMESTAMP WITH TIME ZONE) AS end, x.res->'deprecations' AS deprecations
x.duration AS duration, FROM {tbl}, json_to_record({event_data}) AS x("res" json, "duration" text, "task_action" text, "start" text, "end" text)
x.res->'warnings' AS warnings, WHERE ({tbl}.id > {since} AND {tbl}.id <= {until})'''
x.res->'deprecations' AS deprecations if order_by:
FROM {tbl}, json_to_record({event_data}) AS x("res" json, "duration" text, "task_action" text, "start" text, "end" text) query += f' ORDER BY {tbl}.id ASC'
WHERE ({tbl}.id > {since} AND {tbl}.id <= {until}) query += ') TO STDOUT WITH CSV HEADER'
ORDER BY {tbl}.id ASC) TO STDOUT WITH CSV HEADER''' return query
try: try:
return _copy_table(table='events', query=query(f"{tbl}.event_data::json"), path=full_path) return _copy_table(table='events', query=query(f"{tbl}.event_data::json"), path=full_path)
@@ -401,7 +401,7 @@ def events_table_unpartitioned(since, full_path, until, **kwargs):
@register('events_table', '1.3', format='csv', description=_('Automation task records'), expensive=events_slicing_partitioned_modified) @register('events_table', '1.3', format='csv', description=_('Automation task records'), expensive=events_slicing_partitioned_modified)
def events_table_partitioned_modified(since, full_path, until, **kwargs): def events_table_partitioned_modified(since, full_path, until, **kwargs):
return _events_table(since, full_path, until, 'main_jobevent', project_job_created=True, **kwargs) return _events_table(since, full_path, until, 'main_jobevent', project_job_created=True, order_by=False, **kwargs)
@register('unified_jobs_table', '1.2', format='csv', description=_('Data on jobs run'), expensive=four_hour_slicing) @register('unified_jobs_table', '1.2', format='csv', description=_('Data on jobs run'), expensive=four_hour_slicing)