Compare commits

...

17 Commits

Author SHA1 Message Date
Seth Foster
da9b8135e8 Merge pull request #13315 from fosterseth/update_task_manager_md
update task manager docs after refactoring
2022-12-12 12:42:49 -05:00
Elijah DeLee
76cecf3f6b update capacity docs to cover hybrid node case
this came up in conversation and I saw this was not in this doc as an example
2022-12-12 12:11:56 -05:00
Seth Foster
916b5642d2 Update task manager docs
- DependencyManager and WorkflowManager
- bulk reschedule
- global task manager timeout
- blocking logic

Co-authored-by: Elijah DeLee <kdelee@redhat.com>
Co-authored-by: John R Barker <john@johnrbarker.com>
2022-12-12 11:56:40 -05:00
Rick Elrod
01e9a611ea Add broadcast_websocket to LOG_AGGREGATOR_LOGGERS
... so that errors from it get logged to external loggers by default.

Signed-off-by: Rick Elrod <rick@elrod.me>
2022-12-08 17:50:20 -06:00
Rick Elrod
ef29589940 Fix duped stats name and Redis for wsbroadcast
This fixes several things related to our wsbroadcast stats handling.
This was found during the ongoing wsrelay work.

There are really three fixes here:

- Logging was not actually enabled for the analytics.broadcast_websocket
  module, so that has been added to our loggers config.

- analytics.broadcast_websocket was not actually able to connect to
  Redis due to 68614b83c0 as part of
  the work in #13187. But there was no easy way to know this because the
  logging issue meant no exceptions showed up anywhere reasonable.

- Relatedly, and also as part of #13187, we jumped from
  `prometheus-client` 0.7.1 up to 0.15.0. This included a breaking
  change where a `Counter` ending with `_total` will clash with a
  `Gauge` of the same name but without `_total`. I am not 100% sure of
  the reasoning here, other than "OpenMetrics compatibility".
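For reference, the `_total` clash is easy to reproduce with prometheus-client 0.15
(a minimal sketch; the metric names are illustrative, not the ones AWX registers):

```python
from prometheus_client import CollectorRegistry, Counter, Gauge

registry = CollectorRegistry()

# Since prometheus-client 0.10, a Counter named "x_total" is registered
# under the base name "x"; the "_total" suffix is re-appended per sample.
Counter('awx_messages_total', 'Total messages', registry=registry)

# This Gauge now collides with the Counter's normalized name and raises
# ValueError: Duplicated timeseries in CollectorRegistry.
Gauge('awx_messages', 'Messages for the current connection', registry=registry)
```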

Refs #13301
Refs #13187

Signed-off-by: Rick Elrod <rick@elrod.me>
2022-12-08 12:54:08 -06:00
Alan Rominger
a0b8215c06 Merge pull request #13296 from AlanCoding/signing_bug
Fix bug, sign work based signing, not verification
2022-12-07 08:44:57 -05:00
Alan Rominger
f88b993b18 Fix bug, sign work based signing, not verification
2022-12-06 16:21:17 -05:00
Jeff Bradberry
adbcb5c5e4 Merge pull request #13289 from jbradberry/improve-psql-paging
Make sure that the psql pager does not clear the screen afterwards
2022-12-06 13:17:24 -05:00
Jeff Bradberry
8054c6aedc Make sure that the psql pager does not clear the screen afterwards
Also, avoid paging if there is a single page.
2022-12-06 10:46:47 -05:00
Seth Foster
e5057691ee Merge pull request #13252 from max-len/patch-1
Update install.rst
2022-12-02 22:46:26 -05:00
Shane McDonald
a0cfd8501c Merge pull request #13274 from rooftopcellist/fix-messages-cmd
Fix make messages target by specify lang
2022-12-02 19:04:09 -05:00
Shane McDonald
99b643bd77 Merge pull request #13268 from simaishi/fix_static
Copy UI static files to /var/lib/awx only for ui-devel build
2022-12-02 19:03:48 -05:00
Sarah Akus
305b39d8e5 Merge pull request #13209 from marshmalien/5990-related-group-column
Add inventory host list related groups column
2022-12-02 16:23:09 -05:00
Christian M. Adams
642003e207 Fix make messages target by specify lang
2022-12-02 10:46:16 -05:00
Satoe Imaishi
06daebbecf Copy UI static files to /var/lib/awx only for ui-devel build
2022-12-01 08:58:05 -05:00
Max Lendrich
eaccf32aa3 Update install.rst
Fix doc for current pip==22.3
2022-11-30 16:54:42 +01:00
Marliana Lara
f0481d0a60 Add inventory host list related groups column
2022-11-21 12:04:40 -05:00
14 changed files with 432 additions and 103 deletions

View File

@@ -389,18 +389,18 @@ $(UI_BUILD_FLAG_FILE):
 	$(PYTHON) tools/scripts/compilemessages.py
 	$(NPM_BIN) --prefix awx/ui --loglevel warn run compile-strings
 	$(NPM_BIN) --prefix awx/ui --loglevel warn run build
-	mkdir -p /var/lib/awx/public/static/css
-	mkdir -p /var/lib/awx/public/static/js
-	mkdir -p /var/lib/awx/public/static/media
-	cp -r awx/ui/build/static/css/* /var/lib/awx/public/static/css
-	cp -r awx/ui/build/static/js/* /var/lib/awx/public/static/js
-	cp -r awx/ui/build/static/media/* /var/lib/awx/public/static/media
 	touch $@

 ui-release: $(UI_BUILD_FLAG_FILE)

 ui-devel: awx/ui/node_modules
 	@$(MAKE) -B $(UI_BUILD_FLAG_FILE)
+	mkdir -p /var/lib/awx/public/static/css
+	mkdir -p /var/lib/awx/public/static/js
+	mkdir -p /var/lib/awx/public/static/media
+	cp -r awx/ui/build/static/css/* /var/lib/awx/public/static/css
+	cp -r awx/ui/build/static/js/* /var/lib/awx/public/static/js
+	cp -r awx/ui/build/static/media/* /var/lib/awx/public/static/media

 ui-devel-instrumented: awx/ui/node_modules
 	$(NPM_BIN) --prefix awx/ui --loglevel warn run start-instrumented
@@ -598,7 +598,7 @@ messages:
 	@if [ "$(VENV_BASE)" ]; then \
 		. $(VENV_BASE)/awx/bin/activate; \
 	fi; \
-	$(PYTHON) manage.py makemessages -l $(LANG) --keep-pot
+	$(PYTHON) manage.py makemessages -l en_us --keep-pot

 print-%:
 	@echo $($*)

View File

@@ -2,6 +2,7 @@ import datetime
 import asyncio
 import logging
 import redis
+import redis.asyncio
 import re

 from prometheus_client import (
@@ -81,7 +82,7 @@ class BroadcastWebsocketStatsManager:
     async def run_loop(self):
         try:
-            redis_conn = await redis.asyncio.create_redis_pool(settings.BROKER_URL)
+            redis_conn = await redis.asyncio.Redis.from_url(settings.BROKER_URL)
             while True:
                 stats_data_str = ''.join(stat.serialize() for stat in self._stats.values())
                 await redis_conn.set(self._redis_key, stats_data_str)
@@ -121,8 +122,8 @@ class BroadcastWebsocketStats:
             'Number of messages received, to be forwarded, by the broadcast websocket system',
             registry=self._registry,
         )
-        self._messages_received = Gauge(
-            f'awx_{self.remote_name}_messages_received',
+        self._messages_received_current_conn = Gauge(
+            f'awx_{self.remote_name}_messages_received_currrent_conn',
             'Number forwarded messages received by the broadcast websocket system, for the duration of the current connection',
             registry=self._registry,
         )
@@ -143,13 +144,13 @@ class BroadcastWebsocketStats:
     def record_message_received(self):
         self._internal_messages_received_per_minute.record()
-        self._messages_received.inc()
+        self._messages_received_current_conn.inc()
         self._messages_received_total.inc()

     def record_connection_established(self):
         self._connection.state('connected')
         self._connection_start.set_to_current_time()
-        self._messages_received.set(0)
+        self._messages_received_current_conn.set(0)

     def record_connection_lost(self):
         self._connection.state('disconnected')

View File

@@ -569,7 +569,7 @@ register(
 register(
     'LOG_AGGREGATOR_LOGGERS',
     field_class=fields.StringListField,
-    default=['awx', 'activity_stream', 'job_events', 'system_tracking'],
+    default=['awx', 'activity_stream', 'job_events', 'system_tracking', 'broadcast_websocket'],
     label=_('Loggers Sending Data to Log Aggregator Form'),
     help_text=_(
@@ -577,7 +577,8 @@ register(
         'awx - service logs\n'
         'activity_stream - activity stream records\n'
         'job_events - callback data from Ansible job events\n'
-        'system_tracking - facts gathered from scan jobs.'
+        'system_tracking - facts gathered from scan jobs\n'
+        'broadcast_websocket - errors pertaining to websockets broadcast metrics\n'
     ),
     category=_('Logging'),
     category_slug='logging',

View File

@@ -63,7 +63,7 @@ def read_receptor_config():
 def work_signing_enabled(config_data):
     for section in config_data:
-        if 'work-verification' in section:
+        if 'work-signing' in section:
             return True
     return False

View File

@@ -118,7 +118,7 @@ class WebsocketTask:
             logger.warning(f"Connection from {self.name} to {self.remote_host} timed out.")
         except Exception as e:
             # Early on, this is our canary. I'm not sure what exceptions we can really encounter.
-            logger.warning(f"Connection from {self.name} to {self.remote_host} failed for unknown reason: '{e}'.")
+            logger.exception(f"Connection from {self.name} to {self.remote_host} failed for unknown reason: '{e}'.")
         else:
             logger.warning(f"Connection from {self.name} to {self.remote_host} lost.")

View File

@@ -853,6 +853,7 @@ LOGGING = {
         'awx.main.signals': {'level': 'INFO'},  # very verbose debug-level logs
         'awx.api.permissions': {'level': 'INFO'},  # very verbose debug-level logs
         'awx.analytics': {'handlers': ['external_logger'], 'level': 'INFO', 'propagate': False},
+        'awx.analytics.broadcast_websocket': {'handlers': ['console', 'file', 'wsbroadcast', 'external_logger'], 'level': 'INFO', 'propagate': False},
         'awx.analytics.performance': {'handlers': ['console', 'file', 'tower_warnings', 'external_logger'], 'level': 'DEBUG', 'propagate': False},
         'awx.analytics.job_lifecycle': {'handlers': ['console', 'job_lifecycle'], 'level': 'DEBUG', 'propagate': False},
         'django_auth_ldap': {'handlers': ['console', 'file', 'tower_warnings'], 'level': 'DEBUG'},

View File

@@ -20,6 +20,10 @@ class Hosts extends Base {
    return this.http.get(`${this.baseUrl}${id}/all_groups/`, { params });
  }

  readGroups(id, params) {
    return this.http.get(`${this.baseUrl}${id}/groups/`, { params });
  }

  readGroupsOptions(id) {
    return this.http.options(`${this.baseUrl}${id}/groups/`);
  }

View File

@@ -1,13 +1,17 @@
-import React from 'react';
+import React, { useCallback } from 'react';
 import { string, bool, func } from 'prop-types';
 import { t } from '@lingui/macro';
-import { Button } from '@patternfly/react-core';
+import { Button, Chip } from '@patternfly/react-core';
 import { Tr, Td } from '@patternfly/react-table';
 import { Link } from 'react-router-dom';
 import { PencilAltIcon } from '@patternfly/react-icons';
-import { ActionsTd, ActionItem, TdBreakWord } from 'components/PaginatedTable';
+import { HostsAPI } from 'api';
+import AlertModal from 'components/AlertModal';
+import ChipGroup from 'components/ChipGroup';
+import ErrorDetail from 'components/ErrorDetail';
 import HostToggle from 'components/HostToggle';
+import { ActionsTd, ActionItem, TdBreakWord } from 'components/PaginatedTable';
+import useRequest, { useDismissableError } from 'hooks/useRequest';
 import { Host } from 'types';

 function InventoryHostItem({
@@ -19,45 +23,106 @@ function InventoryHostItem({
   rowIndex,
 }) {
   const labelId = `check-action-${host.id}`;

+  const initialGroups = host?.summary_fields?.groups ?? {
+    results: [],
+    count: 0,
+  };
+
+  const {
+    error,
+    request: fetchRelatedGroups,
+    result: relatedGroups,
+  } = useRequest(
+    useCallback(async (hostId) => {
+      const { data } = await HostsAPI.readGroups(hostId);
+      return data.results;
+    }, []),
+    initialGroups.results
+  );
+
+  const { error: dismissableError, dismissError } = useDismissableError(error);
+
+  const handleOverflowChipClick = (hostId) => {
+    if (relatedGroups.length === initialGroups.count) {
+      return;
+    }
+    fetchRelatedGroups(hostId);
+  };
+
   return (
-    <Tr id={`host-row-${host.id}`} ouiaId={`inventory-host-row-${host.id}`}>
-      <Td
-        data-cy={labelId}
-        select={{
-          rowIndex,
-          isSelected,
-          onSelect,
-        }}
-      />
-      <TdBreakWord id={labelId} dataLabel={t`Name`}>
-        <Link to={`${detailUrl}`}>
-          <b>{host.name}</b>
-        </Link>
-      </TdBreakWord>
-      <TdBreakWord
-        id={`host-description-${host.id}`}
-        dataLabel={t`Description`}
-      >
-        {host.description}
-      </TdBreakWord>
-      <ActionsTd dataLabel={t`Actions`} gridColumns="auto 40px">
-        <HostToggle host={host} />
-        <ActionItem
-          visible={host.summary_fields.user_capabilities?.edit}
-          tooltip={t`Edit host`}
-        >
-          <Button
-            ouiaId={`${host.id}-edit-button`}
-            variant="plain"
-            component={Link}
-            to={`${editUrl}`}
-          >
-            <PencilAltIcon />
-          </Button>
-        </ActionItem>
-      </ActionsTd>
-    </Tr>
+    <>
+      <Tr id={`host-row-${host.id}`} ouiaId={`inventory-host-row-${host.id}`}>
+        <Td
+          data-cy={labelId}
+          select={{
+            rowIndex,
+            isSelected,
+            onSelect,
+          }}
+        />
+        <TdBreakWord id={labelId} dataLabel={t`Name`}>
+          <Link to={`${detailUrl}`}>
+            <b>{host.name}</b>
+          </Link>
+        </TdBreakWord>
+        <TdBreakWord
+          id={`host-description-${host.id}`}
+          dataLabel={t`Description`}
+        >
+          {host.description}
+        </TdBreakWord>
+        <TdBreakWord
+          id={`host-related-groups-${host.id}`}
+          dataLabel={t`Related Groups`}
+        >
+          <ChipGroup
+            aria-label={t`Related Groups`}
+            numChips={4}
+            totalChips={initialGroups.count}
+            ouiaId="host-related-groups-chips"
+            onOverflowChipClick={() => handleOverflowChipClick(host.id)}
+          >
+            {relatedGroups.map((group) => (
+              <Chip key={group.name} isReadOnly>
+                {group.name}
+              </Chip>
+            ))}
+          </ChipGroup>
+        </TdBreakWord>
+        <ActionsTd
+          aria-label={t`Actions`}
+          dataLabel={t`Actions`}
+          gridColumns="auto 40px"
+        >
+          <HostToggle host={host} />
+          <ActionItem
+            visible={host.summary_fields.user_capabilities?.edit}
+            tooltip={t`Edit host`}
+          >
+            <Button
+              aria-label={t`Edit host`}
+              ouiaId={`${host.id}-edit-button`}
+              variant="plain"
+              component={Link}
+              to={`${editUrl}`}
+            >
+              <PencilAltIcon />
+            </Button>
+          </ActionItem>
+        </ActionsTd>
+      </Tr>
+      {dismissableError && (
+        <AlertModal
+          isOpen={dismissableError}
+          onClose={dismissError}
+          title={t`Error!`}
+          variant="error"
+        >
+          {t`Failed to load related groups.`}
+          <ErrorDetail error={dismissableError} />
+        </AlertModal>
+      )}
+    </>
   );
 }

View File

@@ -1,6 +1,21 @@
 import React from 'react';
-import { mountWithContexts } from '../../../../testUtils/enzymeHelpers';
+import { Router } from 'react-router-dom';
+import {
+  render,
+  fireEvent,
+  screen,
+  waitFor,
+  within,
+} from '@testing-library/react';
+import '@testing-library/jest-dom';
+import { HostsAPI } from 'api';
+import { i18n } from '@lingui/core';
+import { en } from 'make-plural/plurals';
 import InventoryHostItem from './InventoryHostItem';
 import { createMemoryHistory } from 'history';
+import english from '../../../locales/en/messages';
+
+jest.mock('api');
const mockHost = {
id: 1,
@@ -24,58 +39,194 @@ const mockHost = {
finished: '2020-02-26T22:38:41.037991Z',
},
],
groups: {
count: 1,
results: [
{
id: 11,
name: 'group_11',
},
],
},
},
};
describe('<InventoryHostItem />', () => {
let wrapper;
const history = createMemoryHistory({
initialEntries: ['/inventories/inventory/1/hosts'],
});
beforeEach(() => {
wrapper = mountWithContexts(
const getChips = (currentScreen) => {
const list = currentScreen.getByRole('list', {
name: 'Related Groups',
});
const { getAllByRole } = within(list);
const items = getAllByRole('listitem');
return items.map((item) => item.textContent);
};
const Component = (props) => (
<Router history={history}>
<table>
<tbody>
<InventoryHostItem
isSelected={false}
detailUrl="/host/1"
onSelect={() => {}}
editUrl={`/inventories/inventory/1/hosts/1/edit`}
host={mockHost}
isSelected={false}
onSelect={() => {}}
{...props}
/>
</tbody>
</table>
);
</Router>
);
beforeEach(() => {
i18n.loadLocaleData({ en: { plurals: en } });
i18n.load({ en: english });
i18n.activate('en');
});
test('should display expected details', () => {
expect(wrapper.find('InventoryHostItem').length).toBe(1);
expect(wrapper.find('Td[dataLabel="Name"]').find('Link').prop('to')).toBe(
render(<Component />);
expect(screen.getByRole('cell', { name: 'Bar' })).toBeInTheDocument();
expect(
screen.getByRole('checkbox', { name: 'Toggle host' })
).toBeInTheDocument();
expect(screen.getByRole('link', { name: 'Host 1' })).toHaveAttribute(
'href',
'/host/1'
);
expect(wrapper.find('Td[dataLabel="Description"]').text()).toBe('Bar');
});
expect(screen.getByRole('link', { name: 'Edit host' })).toHaveAttribute(
'href',
'/inventories/inventory/1/hosts/1/edit'
);
test('edit button shown to users with edit capabilities', () => {
expect(wrapper.find('PencilAltIcon').exists()).toBeTruthy();
const relatedGroupChips = getChips(screen);
expect(relatedGroupChips).toEqual(['group_11']);
});
test('edit button hidden from users without edit capabilities', () => {
const copyMockHost = { ...mockHost };
copyMockHost.summary_fields.user_capabilities.edit = false;
wrapper = mountWithContexts(
<table>
<tbody>
<InventoryHostItem
isSelected={false}
detailUrl="/host/1"
onSelect={() => {}}
host={copyMockHost}
/>
</tbody>
</table>
);
expect(wrapper.find('PencilAltIcon').exists()).toBeFalsy();
render(<Component host={copyMockHost} />);
expect(screen.queryByText('Edit host')).toBeNull();
});
test('should display host toggle', () => {
expect(wrapper.find('HostToggle').length).toBe(1);
test('should show and hide related groups on overflow button click', async () => {
const copyMockHost = { ...mockHost };
const mockGroups = [
{
id: 1,
name: 'group_1',
},
{
id: 2,
name: 'group_2',
},
{
id: 3,
name: 'group_3',
},
{
id: 4,
name: 'group_4',
},
{
id: 5,
name: 'group_5',
},
{
id: 6,
name: 'group_6',
},
];
copyMockHost.summary_fields.groups = {
count: 6,
results: mockGroups.slice(0, 5),
};
HostsAPI.readGroups.mockReturnValue({
data: {
results: mockGroups,
},
});
render(<Component host={copyMockHost} />);
const initialRelatedGroupChips = getChips(screen);
expect(initialRelatedGroupChips).toEqual([
'group_1',
'group_2',
'group_3',
'group_4',
'2 more',
]);
const overflowGroupsButton = screen.queryByText('2 more');
fireEvent.click(overflowGroupsButton);
await waitFor(() => expect(HostsAPI.readGroups).toHaveBeenCalledWith(1));
const expandedRelatedGroupChips = getChips(screen);
expect(expandedRelatedGroupChips).toEqual([
'group_1',
'group_2',
'group_3',
'group_4',
'group_5',
'group_6',
'Show less',
]);
const collapseGroupsButton = await screen.findByText('Show less');
fireEvent.click(collapseGroupsButton);
const collapsedRelatedGroupChips = getChips(screen);
expect(collapsedRelatedGroupChips).toEqual(initialRelatedGroupChips);
});
test('should show error modal when related groups api request fails', async () => {
const copyMockHost = { ...mockHost };
const mockGroups = [
{
id: 1,
name: 'group_1',
},
{
id: 2,
name: 'group_2',
},
{
id: 3,
name: 'group_3',
},
{
id: 4,
name: 'group_4',
},
{
id: 5,
name: 'group_5',
},
{
id: 6,
name: 'group_6',
},
];
copyMockHost.summary_fields.groups = {
count: 6,
results: mockGroups.slice(0, 5),
};
HostsAPI.readGroups.mockRejectedValueOnce(new Error());
render(<Component host={copyMockHost} />);
await waitFor(() => {
const overflowGroupsButton = screen.queryByText('2 more');
fireEvent.click(overflowGroupsButton);
});
expect(screen.getByRole('dialog', { name: 'Alert modal Error!' }));
});
});

View File

@@ -137,6 +137,7 @@ function InventoryHostList() {
         <HeaderRow qsConfig={QS_CONFIG}>
           <HeaderCell sortKey="name">{t`Name`}</HeaderCell>
           <HeaderCell sortKey="description">{t`Description`}</HeaderCell>
+          <HeaderCell>{t`Related Groups`}</HeaderCell>
           <HeaderCell>{t`Actions`}</HeaderCell>
         </HeaderRow>
       }

View File

@@ -2,6 +2,6 @@ The preferred way to install the AWX CLI is through pip:

 .. code:: bash

-   pip install "https://github.com/ansible/awx/archive/$VERSION.tar.gz#egg=awxkit&subdirectory=awxkit"
+   pip install "git+https://github.com/ansible/awx.git@$VERSION#egg=awxkit&subdirectory=awxkit"

 ...where ``$VERSION`` is the version of AWX you're running. To see a list of all available releases, visit: https://github.com/ansible/awx/releases

View File

@@ -98,6 +98,7 @@ Examples:
 Given settings.AWX_CONTROL_NODE_TASK_IMPACT is 1:
 - Project updates (where the execution_node is always the same as the controller_node) have a total impact of 2.
 - For container group jobs (where the execution node is not a member of the cluster), only control impact applies, and the controller node has a total task impact of 1.
+- A job executing on a "hybrid" node, where both control and execution occur on the same node, has a task impact of (1 overhead for the ansible main process) + (min(forks, hosts)) + (1 control node task impact). This means a job running on a hybrid node with forks set to 1 would have a total task impact of 3 (see the sketch below).

 ### Selecting the Right settings.AWX_CONTROL_NODE_TASK_IMPACT
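The arithmetic in the new hybrid-node bullet checks out with a quick sketch (an illustrative helper, not AWX source):

```python
def hybrid_task_impact(forks, hosts, control_node_task_impact=1):
    # 1 for the ansible-playbook main process, plus one unit per fork
    # actually used (capped by host count), plus the control overhead.
    return 1 + min(forks, hosts) + control_node_task_impact

# forks=1 against any number of hosts: 1 + 1 + 1 == 3, as the bullet states
assert hybrid_task_impact(forks=1, hosts=25) == 3
```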

View File

@@ -1,18 +1,108 @@
-# Task Manager Overview
+# Task Manager System Overview

-The task manager is responsible for deciding when jobs should be scheduled to run. When choosing a task to run, the considerations are:
+The task management system is made up of three separate components:
+
+1. Dependency Manager
+2. Task Manager
+3. Workflow Manager
+
+Each of these runs in a separate dispatched task, and they can run at the same time as one another.
+
+This system is responsible for deciding when tasks should be scheduled to run. When choosing a task to run, the considerations are:

 1. Creation time
 2. Job dependencies
 3. Capacity

-Independent jobs are run in order of creation time, earliest first. Jobs with dependencies are also run in creation time order within the group of job dependencies. Capacity is the final consideration when deciding to release a job to be run by the task dispatcher.
+Independent tasks are run in order of creation time, earliest first. Tasks with dependencies are also run in creation time order within the group of task dependencies. Capacity is the final consideration when deciding to release a task to be run by the dispatcher.

-## Task Manager Architecture
-
-The task manager has a single entry point, `Scheduler().schedule()`. The method may be called in parallel, at any time, as many times as the user wants. The `schedule()` function tries to acquire a single, global lock using the Instance table first recorded in the database. If the lock cannot be acquired, the method returns. The failure to acquire the lock indicates that there is another instance currently running `schedule()`.
+## Dependency Manager
+
+Responsible for looking at each pending task and determining whether it should create a dependency for that task.
+
+For example, if `update_on_launch` is enabled for a task, a project update will be created as a dependency of that task. The Dependency Manager is responsible for creating that project update.
+
+Dependencies can also have their own dependencies. For example:
+
+```
+ +-----------+
+ |           |  created by web API call
+ |   Job A   |
+ |           |
+ +-----------+---+
+                 |
+                 |
+         +-------v----+
+         | Inventory  |  dependency of Job A
+         | Source     |  created by Dependency Manager
+         | Update B   |
+         +------------+-------+
+                              |
+                              |
+                       +------v------+
+                       | Project     |  dependency of Inventory Source Update B
+                       | Update C    |  created by Dependency Manager
+                       +-------------+
+```
+
+### Dependency Manager Steps
+
+1. Get pending tasks (parent tasks) that have `dependencies_processed = False`
+2. Create a project update if
+    a. not already created
+    b. the last project update is outside of the cache timeout window
+3. Create an inventory source update if
+    a. not already created
+    b. the last inventory source update is outside of the cache timeout window (see the sketch after this list)
+4. Check and create dependencies for these newly created dependencies
+    a. inventory source updates can have a project update dependency
+5. All dependencies are linked to the parent task via the `dependent_jobs` field
+    a. This allows us to cancel the parent task if the dependency fails or is canceled
+6. Update the parent tasks with `dependencies_processed = True`
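Steps 2b and 3b hinge on a cache-timeout comparison, roughly as follows (a sketch: `scm_update_cache_timeout` is the real project field, but the helper itself is hypothetical):

```python
from datetime import timedelta
from django.utils import timezone

def needs_update(project):
    # Only create the dependency when the last successful update
    # finished outside the cache timeout window.
    last = project.last_job_run  # finish time of the last successful update
    if last is None:
        return True
    return timezone.now() - last > timedelta(seconds=project.scm_update_cache_timeout)
```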
+## Task Manager
+
+Responsible for looking at each pending task and determining whether the Task Manager can start that task.
+
+### Task Manager Steps
+
+1. Get pending, waiting, and running tasks that have `dependencies_processed = True`
+2. Before processing pending tasks, the task manager first processes running tasks. This allows it to build a dependency graph and account for the currently consumed capacity in the system.
+    a. The dependency graph is just an internal data structure that tracks which jobs are currently running. It also handles the "soft" blocking logic.
+    b. The capacity is tracked in memory on the `TaskManagerInstances` and `TaskManagerInstanceGroups` objects, which are in-memory representations of the instances and instance groups. These data structures help track what consumed capacity will be as we decide to start new tasks, until such time as we actually commit the state changes to the database.
+3. For each pending task (see the sketch after this list):
+    a. Check if the total number of tasks started in this task manager cycle is > `start_task_limit`
+    b. Check if the task manager has [timed out](#timing-out)
+    c. Check if the task is blocked
+    d. Check if preferred instances have enough capacity to run the task
+4. Start the task by changing its status to `waiting` and submitting it to the dispatcher
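Steps 3 and 4 amount to a guarded loop. A minimal sketch (every helper and name here is hypothetical, not the AWX source):

```python
def start_pending_tasks(pending_tasks, dep_graph, instance_groups, start_task_limit=100):
    started = 0
    for task in pending_tasks:
        if started >= start_task_limit:  # step 3a
            break
        if about_to_time_out():  # step 3b; see "Timing out" below
            break
        if dep_graph.is_blocked(task):  # step 3c: hard or soft blocked
            continue
        instance = instance_groups.best_fit(task)  # step 3d: capacity check
        if instance is None:
            continue
        task.status = 'waiting'  # step 4: hand off to the dispatcher
        task.save(update_fields=['status'])
        submit_to_dispatcher(task, instance)
        started += 1
```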
+## Workflow Manager
+
+Responsible for looking at each workflow job and determining if the next node can run.
+
+### Workflow Manager Steps
+
+1. Get all running workflow jobs
+2. Build up a workflow DAG for each workflow job
+3. For each workflow job:
+    a. Check if the workflow manager has [timed out](#timing-out)
+    b. Check if the next node can start based on the previous node's status and the associated success / failure / always logic
+4. Create the new task and signal its start
+
+## Task Manager System Architecture
+
+Each of the three managers has a single entry point, `schedule()`. The `schedule()` function tries to acquire a single, global lock recorded in the database. If the lock cannot be acquired, the method returns. The failure to acquire the lock indicates that there is another instance currently running `schedule()`.
+
+Each manager runs inside an atomic DB transaction. If the dispatcher task that is running the manager is killed, none of the created tasks or updates will take effect.
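The lock-then-work pattern can be sketched with a Postgres advisory lock (a sketch of the pattern, assuming django-pglocks; the lock name and loop body are illustrative):

```python
from django.db import transaction
from django_pglocks import advisory_lock

def schedule():
    # wait=False: if another node already holds the lock, return immediately
    # and let that node's run do the work.
    with advisory_lock('task_manager_lock', wait=False) as acquired:
        if not acquired:
            return
        with transaction.atomic():
            process_tasks()  # hypothetical body of a manager run
```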
 ### Hybrid Scheduler: Periodic + Event

-The `schedule()` function is run (a) periodically by a background task and (b) on job creation or completion. The task manager system would behave correctly if it ran, exclusively, via (a) or (b).
+Each manager's `schedule()` function is run (a) periodically by a background task and (b) on job creation or completion. The task manager system would behave correctly if it ran, exclusively, via (a) or (b).
+
+Special note -- the workflow manager is not scheduled to run periodically *directly*, but piggy-backs off the task manager. That is, if the task manager sees at least one running workflow job, it will schedule the workflow manager to run.

 `schedule()` is triggered via both mechanisms because of the following properties:

 1. It reduces the time from launch to running, resulting in a better user experience.
@@ -20,21 +110,34 @@ The `schedule()` function is run (a) periodically by a background task and (b) o

 Empirically, the periodic task manager has been effective in the past and will continue to be relied upon with the added event-triggered `schedule()`.

-### Scheduler Algorithm
-
-* Get all non-completed jobs, `all_tasks`
-* Detect finished workflow jobs
-* Spawn next workflow jobs if needed
-* For each pending job, start with the oldest created job
-* If the job is not blocked, and there is capacity in the instance group queue, then mark it as `waiting` and submit the job.
+### Bulk Reschedule
+
+Typically each manager is run asynchronously via the dispatcher system. Dispatcher tasks take resources, so it is important not to schedule tasks unnecessarily. We also need a mechanism to run a manager *after* an atomic transaction block.
+
+Scheduling the managers is facilitated through the `ScheduleTaskManager`, `ScheduleDependencyManager`, and `ScheduleWorkflowManager` classes. These are utilities that help prevent too many managers from being started via the dispatcher system. Think of it as a "do once" mechanism.
+
+```python3
+with transaction.atomic():
+    for t in tasks:
+        if condition:
+            ScheduleTaskManager.schedule()
+```
+
+In the above code, we only want to schedule the TaskManager once, after all `tasks` have been processed. `ScheduleTaskManager.schedule()` will handle that logic correctly.
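One plausible shape for such a "do once" helper, assuming Django's `transaction.on_commit` hook (a sketch, not the AWX implementation):

```python
from django.db import transaction

class ScheduleManagerSketch:
    def __init__(self, manager_task):
        self.manager_task = manager_task  # hypothetical dispatcher task
        self.scheduled = False

    def schedule(self):
        if self.scheduled:
            return  # collapse repeated calls within the transaction
        self.scheduled = True
        # Submit only after the surrounding transaction commits, so the
        # manager sees the rows that transaction created.
        transaction.on_commit(self._submit)

    def _submit(self):
        self.scheduled = False
        self.manager_task.apply_async()  # hypothetical dispatcher API
```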
+### Timing out
+
+Because of each manager's global lock, only one instance of a given manager can run at a time. If a manager gets stuck for whatever reason, it is important to kill it and let a new one take its place. As such, there is special code in the parent dispatcher process to SIGKILL any of the task system managers after a few minutes.
+
+There is an important side effect to this. Because the manager's `schedule()` runs in a transaction, the next run will have to re-process the same tasks again. This could leave a manager never able to progress from one run to the next, as it times out each time. In that situation the task system is effectively stuck, because new tasks cannot start. To mitigate this, each manager checks whether it is about to hit the timeout period and bails out early if so. This gives the manager enough time to commit the DB transaction, and the next manager cycle can start with the next set of unprocessed tasks. This ensures that the system can still make incremental progress under high workloads (i.e., many pending tasks).
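The bail-out reduces to a time guard inside the manager's main loop (illustrative values and names):

```python
import time

TIMEOUT = 300  # illustrative; the dispatcher's SIGKILL deadline
GRACE = 30     # leave room to commit the DB transaction

def run_cycle(tasks):
    start = time.monotonic()
    for task in tasks:
        if time.monotonic() - start > TIMEOUT - GRACE:
            break  # bail out early; this cycle's work still commits
        process(task)  # hypothetical per-task work
```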
 ### Job Lifecycle

 | Job Status | State |
-|:----------:|:------------------------------------------------------------------------------------------------------------------:|
+|:-----------|:-------------------------------------------------------------------------------------------------------------------|
 | pending    | Job has been launched. <br>1. Hasn't yet been seen by the scheduler <br>2. Is blocked by another task <br>3. Not enough capacity |
-| waiting    | Job published to an AMQP queue. |
+| waiting    | Job submitted to the dispatcher via pg_notify. |
 | running    | Job is running on an AWX node. |
 | successful | Job finished with `ansible-playbook` return code 0. |
 | failed     | Job finished with `ansible-playbook` return code other than 0. |

@@ -46,19 +149,20 @@ Empirically, the periodic task manager has been effective in the past and will c

 The Task Manager decides which exact node a job will run on. It does so by considering the user-configured group execution policy and user-configured capacity. First, the set of groups on which a job _can_ run is constructed (see the AWX document on [Clustering](https://github.com/ansible/awx/blob/devel/docs/clustering.md)). The groups are traversed until a node within a group is found. The idle node with the largest remaining capacity is chosen first. If there are no idle nodes, then the node with the largest remaining capacity greater than or equal to the job capacity requirements is chosen.
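That selection policy reduces to a few lines; a sketch of the described policy (attribute names are illustrative, not the AWX source):

```python
def select_node(instances, task_impact):
    """Prefer the idle node with the most remaining capacity; otherwise pick
    the node with the most remaining capacity that still fits the job.
    Returns None when nothing fits, leaving the job pending."""
    idle = [i for i in instances if i.jobs_running == 0]
    if idle:
        return max(idle, key=lambda i: i.remaining_capacity)
    fits = [i for i in instances if i.remaining_capacity >= task_impact]
    return max(fits, key=lambda i: i.remaining_capacity) if fits else None
```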
-## Code Composition
+## Managers are short-lived

-The main goal of the new task manager is to run in our HA environment. This translates to making the task manager logic run on any AWX node. To support this, we need to remove any reliance on the state between task manager schedule logic runs. A future goal of AWX is to design the task manager to have limited/no access to the database for this feature. This secondary requirement, combined with performance needs, led to the creation of partial models that wrap dict database model data.
+Manager instances are short-lived. Each time one runs, a new instance of the manager class is created, the relevant data is pulled in from the database, and the manager processes the data. After running, the instance is cleaned up.

+### Blocking Logic
+
+The blocking logic is handled by a mixture of ORM instance references and local tracking data in the manager instance.
+
+There is a distinction between so-called "hard" and "soft" blocking.
+
+**Hard blocking** refers to dependencies that are represented in the database via the task's `dependent_jobs` field. That is, Job A will not run if any of its `dependent_jobs` are still running.
+
+**Soft blocking** refers to blocking logic that doesn't have a database representation. Imagine Job A and Job B are both based on the same job template, and concurrent jobs are disabled. Job B will be blocked from running if Job A is already running. This is determined purely by the task manager tracking running jobs via the Dependency Graph.

-## Acceptance Tests
-
-The new task manager should, in essence, work like the old one. Old task manager features were identified while new ones were discovered in the process of creating the new task manager. Rules for the new task manager behavior are iterated below; testing should ensure that those rules are followed.
-
-### Task Manager Rules
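A minimal sketch of the soft-blocking side of that dependency graph (illustrative; the real tracking structure in AWX is more involved):

```python
class DependencyGraphSketch:
    """Tracks running jobs in memory to answer soft-blocking questions."""

    def __init__(self):
        self.running_by_template = {}  # job_template_id -> running job

    def add_job(self, job):
        self.running_by_template[job.job_template_id] = job

    def is_blocked(self, job):
        # Soft block: another job from the same template is running and
        # concurrent jobs are disabled (allow_simultaneous is the real field).
        other = self.running_by_template.get(job.job_template_id)
        return other is not None and not job.allow_simultaneous
```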

View File

@@ -274,7 +274,7 @@ RUN for dir in \
     /var/run/nginx.pid \
     /var/lib/awx/venv/awx/lib/python3.9/site-packages/awx.egg-link ; \
     do touch $file ; chmod g+rw $file ; done && \
-    echo "\setenv PAGER 'less -S'" > /var/lib/awx/.psqlrc
+    echo "\setenv PAGER 'less -SXF'" > /var/lib/awx/.psqlrc
 {% endif %}

 {% if not build_dev|bool %}