Compare commits


226 Commits

Author SHA1 Message Date
Sarah Akus
9df447fe75 Merge pull request #12778 from keithjgrant/12542-schedule-exceptions
Schedule exceptions
2022-09-15 16:28:56 -04:00
Keith J. Grant
7e7991bb63 adjust DetailList spacing when two appear in succession 2022-09-15 09:37:03 -07:00
Keith J. Grant
35e9d00beb improve frequency validation performance 2022-09-14 15:33:00 -07:00
Elijah DeLee
461b5221f3 Add graphs for job event processing to dashboard 2022-09-14 16:23:53 -04:00
Elijah DeLee
10d06f219d add alerting rule to grafana
This rule alerts if the redis queue is larger than the rolling average
event insertion rate/second * 120. In other words, the alert fires if the redis
queue holds more events than it appears we can process in two minutes.

It appears it has to meet this condition for 60 seconds to start firing.

Future commits will address how to configure contact points like slack.

shout out to @jainnikhil30 and @rebeccahhh who figured this out in a jam
session this morning.
2022-09-14 16:23:53 -04:00
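
A rough sketch of the threshold arithmetic described above, in plain Python; the
helper and its inputs are assumptions for illustration, not the actual
Grafana/Prometheus rule added by this commit:

def queue_backlog_alert(queue_depth, events_inserted_per_second, window_seconds=120):
    # Fire when the queue holds more events than we expect to drain in window_seconds.
    return queue_depth > events_inserted_per_second * window_seconds

# Example: 10,000 queued events draining at ~70 events/sec -> threshold is 8,400, so the alert fires.
print(queue_backlog_alert(10_000, 70))  # True
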
Jessica Steurer
a227fea5ef Merge pull request #12868 from keithjgrant/12853-ws-event-duplication
Don't add ws events twice to job output
2022-09-14 16:02:07 -03:00
Jessica Steurer
3f4d0bc15d Merge pull request #12788 from AlexSCorey/5941-Translations
Ensures that strings in helpText files do not miss being translated
2022-09-14 12:02:51 -03:00
Rick Elrod
0812425671 [ui] Minor tweak to capitalize GPG properly (#12734)
"GPG Public Key", not "Gpg Public Key"

Signed-off-by: Rick Elrod <rick@elrod.me>
2022-09-14 01:37:09 +00:00
Alex Corey
94344c0214 Merge pull request #12859 from AlexSCorey/updateCanIUse-lite
updates CanIUseLite
2022-09-13 13:48:20 -04:00
Keith J. Grant
16da9b784a add schedule integration test locators 2022-09-12 16:30:46 -07:00
Keith J. Grant
1e952bab95 fix error message on new schedules with no instances 2022-09-12 12:58:25 -07:00
Jake Jackson
484db004db Update Kind Docs (#12865)
* update kind docs formatting and update some commands

* add tested on fedora update
2022-09-12 13:04:04 -04:00
Alex Corey
7465d7685f updates CanIUseLite 2022-09-09 11:17:54 -04:00
Seth Foster
f0c125efb3 Merge pull request #12762 from akira6592/fix-doc-link
fix link of Patternfly style guide
2022-09-09 09:52:00 -04:00
Keith J. Grant
2d39b81e12 don't add ws events twice to job output 2022-09-08 16:09:02 -07:00
Akira Yokochi
1044d34d98 fix link on doc 2022-09-08 22:49:11 +00:00
Rick Elrod
63567fcc52 [sig validation] better error for job template run (#12735)
When launching a job template, if the last project update failed due to
signature validation, show an error that actually says that.

Signed-off-by: Rick Elrod <rick@elrod.me>
2022-09-08 02:13:41 -05:00
Matthew Jones
cea8c16064 Merge pull request #12724 from mtward/issue-11605
Fix: preserve_existing_hosts flag in awx.awx.group module, while adding a new host to inventory group, retains only 25 existing hosts related #11605
2022-09-07 20:23:58 -04:00
John Westcott IV
e7c97923a3 Merge pull request #12785 from jangel97/devel
Fix list_instances command
* Change from modified to last seen
2022-09-07 14:48:38 -04:00
Keith J. Grant
078c3ae6d8 add schedule form validation to ensure at least one occurrence 2022-09-07 10:33:16 -07:00
Rick Elrod
1ab3dba476 Add "cryptography" kind to CredentialType (#12842)
This was missed when we landed #12813. Adds cryptography
kind to the CredentialType allowed kinds list, which now
produces the proper error message when attempting to PUT
to modify the managed credential type.

Signed-off-by: Rick Elrod <rick@elrod.me>
2022-09-07 12:22:47 -05:00
Alan Rominger
15964dc395 Merge pull request #11745 from AlanCoding/cancel_rework_no_close
Close database connections while processing job output
2022-09-06 15:45:29 -04:00
Keith Grant
b83b65da16 clear output follow mode flag on search (#12791) 2022-09-06 15:15:06 -04:00
Alan Rominger
430f1986c7 Merge pull request #12830 from AlanCoding/dev_stuff
Fix LDAP volume conditional, better metrics interval
2022-09-06 11:51:51 -04:00
Alex Corey
c589f8776c Fixes possible missed translation 2022-09-06 11:26:41 -04:00
Jose Angel Morena
82679ce9a3 replace modified by last_seen in heartbeat 2022-09-06 17:14:19 +02:00
Lila Yasin
6d2e28bfb0 [collection] Add GPG key information to inputs and credential types in documentation. (#12817) 2022-09-06 10:05:36 -05:00
Luiz Costa
7a4da5a8fa Add GPG credential support to awxkit 2022-09-06 10:05:36 -05:00
Rick Elrod
c475a7b6c0 [ui] make signature cred. field be project-global (#12695)
Rather than only allowing the signature credential to be specified on
projects using git, allow it to be specified on any project at all.

This moves the field to always show, and moves it out of the git
subform.

Signed-off-by: Rick Elrod <rick@elrod.me>
2022-09-06 10:05:36 -05:00
Rick Elrod
32bb603554 Update action plugin to use ansible-sign library
Signed-off-by: Rick Elrod <rick@elrod.me>
2022-09-06 10:05:36 -05:00
Rick Elrod
8d71292d1a Integrity checking on project sync
Signed-off-by: Rick Elrod <rick@elrod.me>
2022-09-06 10:05:36 -05:00
Veda Periwal
e896dc1aa7 Add Content Signature Validation Credential field to Projects Form page and Projects Detail page 2022-09-06 10:05:36 -05:00
Hao Liu
f5a2246817 add new managed credential type for gpg pub key
add new managed credential type for gpg pub key
add migration file to setup managed credential types to add the new credential type

Signed-off-by: Hao Liu <haoli@redhat.com>
2022-09-06 10:05:36 -05:00
Hao Liu
c467b6ea13 add signature_validation_credential to Project
add new column to `main_project` table
- `signature_validation_credential`

update project module for awx_collection
- added input arg for `signature_validation_credential`

Co-Authored-By: Lila Yasin  <89486372+djyasin@users.noreply.github.com>
2022-09-06 10:05:36 -05:00
Alex Corey
1636f6b196 Merge pull request #12835 from ansible/dependabot/npm_and_yarn/awx/ui/devel/patternfly/patternfly-4.210.2
Bump @patternfly/patternfly from 4.202.1 to 4.210.2 in /awx/ui
2022-09-06 10:33:00 -04:00
Alex Corey
5da528ffbb Merge pull request #12834 from ansible/dependabot/npm_and_yarn/awx/ui/devel/ace-builds-1.10.1
Bump ace-builds from 1.8.1 to 1.10.1 in /awx/ui
2022-09-06 10:30:46 -04:00
Alex Corey
2e65ae49a5 Merge pull request #12806 from ansible/dependabot/npm_and_yarn/awx/ui/devel/luxon-3.0.3
Bump luxon from 3.0.1 to 3.0.3 in /awx/ui
2022-09-06 10:15:08 -04:00
Alex Corey
d06bc815f8 Merge pull request #12807 from ansible/dependabot/npm_and_yarn/awx/ui/devel/dompurify-2.4.0
Bump dompurify from 2.3.10 to 2.4.0 in /awx/ui
2022-09-06 10:14:28 -04:00
dependabot[bot]
0290784f9b Bump @patternfly/patternfly from 4.202.1 to 4.210.2 in /awx/ui
Bumps [@patternfly/patternfly](https://github.com/patternfly/patternfly) from 4.202.1 to 4.210.2.
- [Release notes](https://github.com/patternfly/patternfly/releases)
- [Changelog](https://github.com/patternfly/patternfly/blob/main/RELEASE-NOTES.md)
- [Commits](https://github.com/patternfly/patternfly/compare/prerelease-v4.202.1...prerelease-v4.210.2)

---
updated-dependencies:
- dependency-name: "@patternfly/patternfly"
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2022-09-06 14:13:52 +00:00
dependabot[bot]
1cc52afc42 Bump ace-builds from 1.8.1 to 1.10.1 in /awx/ui
Bumps [ace-builds](https://github.com/ajaxorg/ace-builds) from 1.8.1 to 1.10.1.
- [Release notes](https://github.com/ajaxorg/ace-builds/releases)
- [Changelog](https://github.com/ajaxorg/ace-builds/blob/master/CHANGELOG.md)
- [Commits](https://github.com/ajaxorg/ace-builds/compare/v1.8.1...v1.10.1)

---
updated-dependencies:
- dependency-name: ace-builds
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2022-09-06 14:13:17 +00:00
Alex Corey
88f7f987cd Merge pull request #12810 from ansible/dependabot/npm_and_yarn/awx/ui/devel/patternfly/react-table-4.100.8
Bump @patternfly/react-table from 4.93.1 to 4.100.8 in /awx/ui
2022-09-06 10:12:01 -04:00
Alan Rominger
f512971991 Add project sync to job cancel chain 2022-09-05 22:29:19 -04:00
Alan Rominger
53de245877 Fix LDAP volume conditional, better metrics interval 2022-09-04 22:33:12 -04:00
Shane McDonald
749622427c Merge pull request #12825 from shanemcd/extend-includes
Extend black excludes instead of overriding
2022-09-02 15:40:41 -04:00
Alan Rominger
725d6fa896 Merge pull request #12820 from AlanCoding/five_seconds
Make the metrics default sampling interval 5s
2022-09-02 15:21:57 -04:00
Shane McDonald
a107bb684c Extend black excludes instead of overriding
By default it will ignore things in .gitignore, which we want
2022-09-02 15:11:45 -04:00
Alan Rominger
ccbc8ce7de Make the metrics default sampling interval 5s 2022-09-02 13:38:49 -04:00
Shane McDonald
260e1d4f2d Make static asset location consistent across all deployments (#12819) 2022-09-02 17:12:06 +00:00
Shane McDonald
1afa49f3ff Merge pull request #12632 from TheRealHaoLiu/kind-k8s-devel
Add documentation for running development environment in kind
2022-09-02 12:12:01 -04:00
Rick Elrod
6f88ea1dc7 Common Inventory slicing method for job slices
- Extract how slicing is done from Inventory#get_script_data and pull it
  into a new method, Inventory#get_sliced_hosts
- Make use of this method in Inventory#get_script_data
- Make use of this method in Job#_get_inventory_hosts (used by
  Job#start_job_fact_cache and Job#finish_job_fact_cache).

This fixes an issue (namely in Tower 4.1) where job slicing with fact
caching enabled doesn't save facts for all hosts.

Signed-off-by: Rick Elrod <rick@elrod.me>
2022-09-01 16:15:07 -05:00
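
A simplified Python sketch of the refactor described above, using toy classes
rather than the real AWX models: the slicing logic lives in a single
get_sliced_hosts method that both the inventory script path and the fact-cache
path call, so they always agree on which hosts belong to a slice. The modulo
distribution below is one possible scheme, not necessarily AWX's exact one.

class Inventory:
    def __init__(self, hosts):
        self.hosts = hosts

    def get_sliced_hosts(self, hosts, slice_number, slice_count):
        # Slice k of n keeps every n-th host, offset by k - 1.
        if slice_count > 1 and slice_number >= 1:
            return [h for i, h in enumerate(hosts) if i % slice_count == slice_number - 1]
        return hosts

    def get_script_data(self, slice_number=1, slice_count=1):
        # The fact-cache path would call get_sliced_hosts with the same arguments.
        return {"all": {"hosts": self.get_sliced_hosts(self.hosts, slice_number, slice_count)}}

inv = Inventory([f"host{i}" for i in range(9)])
print(inv.get_script_data(slice_number=2, slice_count=3))  # host1, host4, host7
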
Alan Rominger
c59bbdecdb Refactor canceling to work through messaging and signals, not database
If a cancel was attempted before, still allow attempting another cancel;
in this case, attempt to send the sigterm signal again.
Keep clicking, you might help!

Replace other cancel_callbacks with sigterm watcher
  adapt special inventory mechanism for this too

Get rid of the cancel_watcher method with exception in main thread

Handle academic case of sigterm race condition

Process cancelation as control signal

Fully connect cancel method and run_dispatcher to control

Never transition workflows directly to canceled, add logs
2022-09-01 15:20:31 -04:00
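
A minimal, hedged sketch of the signal-driven cancel pattern this commit moves
toward, assuming a POSIX system; this is illustrative and not the actual AWX
dispatcher code.

import os
import signal
import time

cancel_requested = False

def on_sigterm(signum, frame):
    global cancel_requested
    cancel_requested = True  # the job loop checks this flag and shuts down cleanly

signal.signal(signal.SIGTERM, on_sigterm)

# Stand-in for the dispatcher signaling the worker process instead of polling a cancel flag in the database.
os.kill(os.getpid(), signal.SIGTERM)
time.sleep(0.1)
print("canceled:", cancel_requested)  # canceled: True
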
Matthew Jones
f9428c10b9 Merge pull request #12803 from matburt/fix_cleanup_schedules
Fix an issue where default cleanup schedules only run once
2022-09-01 10:40:11 -04:00
dependabot[bot]
1ca054f43d Bump @patternfly/react-table from 4.93.1 to 4.100.8 in /awx/ui
Bumps [@patternfly/react-table](https://github.com/patternfly/patternfly-react) from 4.93.1 to 4.100.8.
- [Release notes](https://github.com/patternfly/patternfly-react/releases)
- [Commits](https://github.com/patternfly/patternfly-react/compare/@patternfly/react-table@4.93.1...@patternfly/react-table@4.100.8)

---
updated-dependencies:
- dependency-name: "@patternfly/react-table"
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2022-09-01 08:09:44 +00:00
dependabot[bot]
374f76b527 Bump dompurify from 2.3.10 to 2.4.0 in /awx/ui
Bumps [dompurify](https://github.com/cure53/DOMPurify) from 2.3.10 to 2.4.0.
- [Release notes](https://github.com/cure53/DOMPurify/releases)
- [Commits](https://github.com/cure53/DOMPurify/compare/2.3.10...2.4.0)

---
updated-dependencies:
- dependency-name: dompurify
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2022-09-01 08:08:10 +00:00
dependabot[bot]
f9dd5e0f1c Bump luxon from 3.0.1 to 3.0.3 in /awx/ui
Bumps [luxon](https://github.com/moment/luxon) from 3.0.1 to 3.0.3.
- [Release notes](https://github.com/moment/luxon/releases)
- [Changelog](https://github.com/moment/luxon/blob/master/CHANGELOG.md)
- [Commits](https://github.com/moment/luxon/compare/3.0.1...3.0.3)

---
updated-dependencies:
- dependency-name: luxon
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2022-09-01 08:07:33 +00:00
Matthew Jones
bb7509498e Fix an issue where default cleanup schedules only run once
This looks like an oversight that has existed for a long time. We intend to run these on a pretty regular basis
2022-08-31 20:10:20 -04:00
Keith Grant
8a06ffbe15 poll for events processing completion (#12689) 2022-08-31 16:03:35 -04:00
Hao Liu
8ad948f268 Merge pull request #12797 from TheRealHaoLiu/remove-helm-from-dockerfile
remove helm from dockerfile template
2022-08-31 14:18:25 -04:00
Hao Liu
73f808dee7 remove helm from dockerfile template
Signed-off-by: Hao Liu <haoli@redhat.com>
2022-08-31 13:48:30 -04:00
Shane McDonald
fecab52f86 Merge pull request #12796 from shanemcd/fix-tests
Prevent openldap from getting downgraded during build
2022-08-31 13:34:04 -04:00
Shane McDonald
609c67d85e Prevent openldap from getting downgraded during build
We noticed here that openldap was getting downgraded and caused our test suite to blow up https://github.com/ansible/awx/runs/8118323342?check_suite_focus=true
2022-08-31 13:09:29 -04:00
Keith J. Grant
0005d249c0 update tests 2022-08-30 15:44:52 -07:00
Hao Liu
8828ea706e add make target for building custom awx kube image (#12789) 2022-08-30 20:19:36 +00:00
Shane McDonald
4070ef3f33 Merge pull request #12787 from shanemcd/pre-build-ui
Speed up image build when UI is pre-built on host
2022-08-30 15:51:43 -04:00
Keith Grant
39f6e2fa32 fix TypeError when config is undefined (#12697) 2022-08-30 15:11:45 -04:00
Shane McDonald
1dfdff4a9e Speed up image build when UI is pre-built on host 2022-08-30 12:36:25 -04:00
Alan Rominger
310e354164 Merge pull request #12769 from AlanCoding/self_conn
Fix sanity check to use the relevant active connection
2022-08-29 20:36:48 -04:00
Keith J. Grant
dda2931e60 fix exception frequency placeholder text 2022-08-29 13:43:49 -07:00
Alan Rominger
6d207d2490 Merge pull request #12754 from kdelee/fix_metrics_consumed_capacity
calculate consumed capacity in the same way in metrics
2022-08-29 16:37:53 -04:00
Alan Rominger
01037fa561 Fix sanity check to use the relevant active connection 2022-08-29 16:33:07 -04:00
Alan Rominger
61f3e5cbed Merge pull request #12702 from AlanCoding/poll_cancel
Check exit conditions in loop waiting for project flock
2022-08-29 16:29:39 -04:00
Alan Rominger
44995e944a Merge pull request #12766 from AlanCoding/lazy_no_more
Revert "Merge pull request #12584 from AlanCoding/lazy_workers"
2022-08-29 16:06:50 -04:00
Keith J. Grant
4a92fcfc62 add schedule exceptions to details 2022-08-29 11:55:32 -07:00
Elijah DeLee
d3f15f5784 Merge pull request #4 from AlanCoding/elijah_metrics
Minor changes to instance loop structure
2022-08-29 14:33:46 -04:00
Alan Rominger
2437a84b48 Minor changes to instance loop structure 2022-08-29 14:28:50 -04:00
Shane McDonald
696f099940 Merge pull request #12749 from shanemcd/not-so-aggressive
Make error handling less aggressive when checking status of dispatcher task
2022-08-29 11:50:56 -04:00
Shane McDonald
3f0f538c40 Merge pull request #12759 from shanemcd/auto-prom
Automate bootstrapping of Prometheus in the development environment
2022-08-29 11:25:13 -04:00
Shane McDonald
66529d0f70 Automate bootstrapping of Prometheus in the development environment 2022-08-29 09:39:44 -04:00
Alan Rominger
974f845059 Revert "Merge pull request #12584 from AlanCoding/lazy_workers"
This reverts commit 64157f7207, reversing
changes made to 9e8ba6ca09.
2022-08-28 23:04:13 -04:00
Keith J. Grant
f6b3413a11 add schedule exemptions to form 2022-08-26 16:00:08 -07:00
Shane McDonald
b4ef687b60 Merge pull request #12760 from shanemcd/another-domino-falls
Fix browsable API in development environment
2022-08-26 17:43:37 -04:00
Shane McDonald
2ef531b2dc Fix browsable API in development environment
Fallout from https://github.com/ansible/awx/pull/12722
2022-08-26 17:19:16 -04:00
Elijah DeLee
125801ec5b add panel to grafana dashboard for capacity
also reorganize so there are two columns of panels, not
just one long skinny set of panels
2022-08-26 15:42:40 -04:00
Shane McDonald
691d9d7dc4 Merge pull request #12755 from shanemcd/fix-dev-env-admin-pw
Fix auto-generated dev env admin password
2022-08-26 13:33:43 -04:00
Shane McDonald
5ca898541f Fix auto-generated dev env admin password
Fallout from https://github.com/ansible/awx/pull/12753
2022-08-26 13:07:46 -04:00
Shane McDonald
24821ff030 Merge pull request #12753 from shanemcd/custom-dev-env-admin-pw
Allow for setting custom admin password in dev environment
2022-08-26 11:55:17 -04:00
Elijah DeLee
99815f8962 calculate consumed capacity in the same way in metrics
We should be consistent about this. This also takes us from doing as many
queries to the UnifiedJob table as we have instances, to doing one query to
the UnifiedJob table (and both approaches do one query to the Instances table)
2022-08-26 11:40:36 -04:00
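
A plain-Python illustration of the change described above: compute consumed
capacity for every node in a single pass over the running jobs, instead of one
lookup per instance. The field names (execution_node, task_impact) mirror AWX,
but the data and loop are made up for the example.

from collections import defaultdict

running_jobs = [
    {"execution_node": "node1", "task_impact": 5},
    {"execution_node": "node1", "task_impact": 3},
    {"execution_node": "node2", "task_impact": 7},
]

consumed = defaultdict(int)
for job in running_jobs:  # one pass, no matter how many instances exist
    consumed[job["execution_node"]] += job["task_impact"]

print(dict(consumed))  # {'node1': 8, 'node2': 7}
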
Shane McDonald
d752e6ce6d Allow for setting custom admin password in dev environment 2022-08-26 11:29:11 -04:00
Shane McDonald
457dd890cb Make error handling less aggressive when checking status of dispatcher task 2022-08-26 09:05:38 -04:00
Christian Adams
4fbf5e9e2f Merge pull request #12731 from rooftopcellist/fix-messages-target
Fix make target for compiling api strings
2022-08-24 17:01:43 -04:00
Christian M. Adams
687b4ac71d Fix make target for compiling api strings 2022-08-24 16:36:25 -04:00
John Westcott IV
a1b364f80c Configuring Keycloak to also do OIDC (#12700) 2022-08-24 07:08:39 -04:00
mtward
271938c5fc Update group.py 2022-08-23 15:06:11 -04:00
Jessica Steurer
ff49cc5636 Merge pull request #12552 from whitej6/jlw-generic-oidc
Implement Generic OIDC Provider
2022-08-23 15:38:43 -03:00
Shane McDonald
9946e644c8 Merge pull request #12722 from shanemcd/fix-static-root
Fix STATIC_ROOT in defaults
2022-08-23 12:58:12 -04:00
Shane McDonald
1ed7a50755 Fix STATIC_ROOT in defaults
Reasoning:

- This is breaking the UI in official image builds of devel
- This is always being overridden in our packaging
- PROJECTS_ROOT and JOBOUTPUT_ROOT also hardcode /var/lib/awx
2022-08-23 12:39:54 -04:00
Jeremy White
9f3396d867 rebasing 2022-08-23 09:51:04 -05:00
John Westcott IV
bcd018707a Adding ability to auto-apply community label to PRs and Issues (#12718) 2022-08-23 07:08:24 -04:00
Shane McDonald
a462978433 Merge pull request #12699 from shanemcd/remove-settings-py-during-build
Remove need for settings.py during image build
2022-08-22 14:13:36 -04:00
Shane McDonald
6d11003975 Remove need for settings.py during image build 2022-08-22 13:46:42 -04:00
Shane McDonald
017e474325 Merge pull request #12704 from shanemcd/dynamic-log-config
Consolidate and refactor logging configuration code
2022-08-22 13:31:28 -04:00
Alex Corey
5d717af778 Merge pull request #12713 from AlexSCorey/CustomizeDependatPRBodies
Edits existing PR body
2022-08-22 12:24:25 -04:00
Alex Corey
8d08ac559d Puts new pr string on a new line 2022-08-22 12:05:43 -04:00
Shane McDonald
4e24867a0b Merge pull request #12703 from shanemcd/ded-code
Delete unused playbook profiling code
2022-08-22 11:33:37 -04:00
Alex Corey
2b4b8839d1 Edits existing PR body 2022-08-22 11:31:49 -04:00
Yuki Yamashita
dba33f9ef5 Replace gethostbyname to getaddrinfo for plugins ipv6 support related #11450 (#12561)
Co-authored-by: yukiy <yyamashi@redhat.com>
2022-08-22 11:07:10 -03:00
Julen Landa Alustiza
db2649d7ba Merge pull request #12706 from ansible/revert-12692-mop_up
Revert "Fix errors in websocket code due to missing template"
2022-08-22 15:53:35 +02:00
Alan Rominger
edc3da85cc Revert "Fix errors in websocket code due to missing template" 2022-08-20 19:09:57 -04:00
Alan Rominger
2357e24d1d Merge pull request #12701 from AlanCoding/no_more_schedules
Make schedule teardown more reliable
2022-08-20 07:05:21 -04:00
Shane McDonald
e4d1056450 Change log level for UnifiedJob#log_lifecycle 2022-08-19 17:56:17 -04:00
Shane McDonald
37d9c9eb1b Consolidate and refactor logging configuration code 2022-08-19 17:16:27 -04:00
Shane McDonald
d42a85714a Delete unused playbook profiling code
We haven't had this feature since pre-AWX 18 (since EEs were introduced) and I can't find any other reference to this.
2022-08-19 17:03:22 -04:00
Alan Rominger
88bf03c6bf Check exit conditions in loop waiting for project flock 2022-08-19 16:08:56 -04:00
Alan Rominger
4b8a56be39 Make schedule teardown more reliable 2022-08-19 15:42:00 -04:00
Alan Rominger
2aa99234f4 Merge pull request #12692 from AlanCoding/mop_up
Fix errors in websocket code due to missing template
2022-08-19 14:46:10 -04:00
Michael Abashian
bf9f1b1d56 Added more context to subscription details and rearrange the order of some of the fields (#12649)
* Adds more context to subscription details and rearranges some of the fields

* Fixes broken unit test after updating subscription details
2022-08-19 09:41:23 -04:00
Alan Rominger
704e4781d9 Fix errors in websocket code due to missing template 2022-08-18 14:05:06 -04:00
Alan Rominger
4a8613ce4c Avoid updating modified_by from None to None (#11838)
This should help the case of inventory updates in particular
  where imported hosts are managed by the system
2022-08-18 11:39:29 -04:00
Alan Rominger
e87fabe6bb Submit job to dispatcher as part of transaction (#12573)
Make it so that submitting a task to the dispatcher happens as part of the transaction.
  this applies to dispatcher task "publishers" which NOTIFY the pg_notify queue
  if the transaction is not successful, it will not be sent, as per postgres docs

This keeps current behavior for pg_notify listeners
  practically, this only applies for the awx-manage run_dispatcher service
  this requires creating a separate connection and keeping it long-lived
  arbitrary code will occasionally close the main connection, which would stop listening

Stop sending the waiting status websocket message
  this is required because the ordering cannot be maintained with other changes here
  the instance group data is moved to the running websocket message payload

Move call to create_partition from task manager to pre_run_hook
  mock this in relevant unit tests
2022-08-18 09:43:53 -04:00
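
A minimal sketch of the PostgreSQL behavior this commit relies on, assuming a
reachable database and psycopg2; the connection string and table name are
hypothetical. A NOTIFY issued inside a transaction is only delivered if that
transaction commits, so the dispatcher is never told about a job whose row was
rolled back.

import psycopg2

conn = psycopg2.connect("dbname=awx")  # assumed connection parameters
try:
    with conn:  # opens a transaction; commits on success, rolls back on exception
        with conn.cursor() as cur:
            cur.execute("INSERT INTO jobs_table (status) VALUES ('pending')")  # hypothetical table
            cur.execute("SELECT pg_notify('dispatcher', 'run job')")           # queued with the transaction
    # If the block above raised, both the INSERT and the NOTIFY are discarded together.
finally:
    conn.close()
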
Alan Rominger
532aa83555 Merge pull request #11833 from AlanCoding/facts_update_fields
Use update_fields for Ansible facts update
2022-08-17 22:37:45 -04:00
Alan Rominger
d87bb973d5 Merge pull request #12090 from AlanCoding/mind_your_own_business
Avoid parent instance update when status was unchanged
2022-08-17 22:29:31 -04:00
Alan Rominger
a72da3bd1a Merge pull request #12582 from AlanCoding/clean_and_forget
Move reaper logic into worker, avoiding bottlenecks
2022-08-17 18:53:47 -04:00
Alan Rominger
56df3f0c2a Merge pull request #12671 from AlanCoding/cut_the_line
Avoid dependency manager for jobs with no deps
2022-08-17 18:50:52 -04:00
Alan Rominger
e0c59d12c1 Change data structure so we can conditionally reap waiting jobs 2022-08-17 16:00:30 -04:00
Alan Rominger
7645cc2707 Remove mocks for reap method that was removed 2022-08-17 15:43:29 -04:00
Alan Rominger
6719010050 Add back in cleanup call 2022-08-17 15:42:48 -04:00
Alan Rominger
ccd46a1c0f Move reaper logic into worker, avoiding bottlenecks 2022-08-17 15:42:47 -04:00
Alex Corey
cc1e349ea8 Merge pull request #12604 from ansible/dependabot/npm_and_yarn/awx/ui/devel/ace-builds-1.8.1
Bump ace-builds from 1.6.0 to 1.8.1 in /awx/ui
2022-08-17 14:11:27 -04:00
Alex Corey
e509d5f1de Merge pull request #12606 from ansible/dependabot/npm_and_yarn/awx/ui/devel/dompurify-2.3.10
Bump dompurify from 2.3.8 to 2.3.10 in /awx/ui
2022-08-17 14:10:51 -04:00
Alan Rominger
4fca27c664 Merge pull request #12289 from AlanCoding/idle_help
Correct help text for job idle timeout
2022-08-17 13:55:44 -04:00
Alan Rominger
51be22aebd Merge pull request #12668 from AlanCoding/graph_tweaks
Remove an old metrics field and add a new one to dashboard
2022-08-17 13:49:17 -04:00
Alan Rominger
54b21e5872 Avoid dependency manager for jobs with no deps 2022-08-17 13:32:59 -04:00
Alan Rominger
85beb9eb70 Merge pull request #12676 from AlanCoding/forward_picks
Stability fixes, and related logging for slowdowns in dispatcher task processing
2022-08-17 13:32:34 -04:00
Alan Rominger
56739ac246 Use delay_update to set error message, according to merge note 2022-08-17 11:45:40 -04:00
Alan Rominger
1ea3c564df Apply a failed status if cancel_flag is not set 2022-08-17 11:42:09 -04:00
Alan Rominger
621833ef0e Add extra workers if computing based on memory
Co-authored-by: Elijah DeLee <kdelee@redhat.com>
2022-08-17 11:41:59 -04:00
Shane McDonald
16be38bb54 Allow for passing custom job_explanation to reaper methods
Co-authored-by: Alan Rominger <arominge@redhat.com>
2022-08-17 11:41:49 -04:00
Shane McDonald
c5976e2584 Add setting for missed heartbeats before marking node offline 2022-08-17 11:39:30 -04:00
Shane McDonald
3c51cb130f Add grace period settings for task manager timeout, and pod / job waiting reapers
Co-authored-by: Alan Rominger <arominge@redhat.com>
2022-08-17 11:39:01 -04:00
Shane McDonald
c649809eb2 Remove debug method that calls cleanup
- It's unclear why this was here.
- Removing it doesn't appear to cause any problems.
- It still gets called during heartbeats.
2022-08-17 11:35:43 -04:00
Alan Rominger
43a53f41dd Add logs about heartbeat skew
Co-authored-by: Shane McDonald <me@shanemcd.com>
2022-08-17 11:33:59 -04:00
Alan Rominger
a3fef27002 Add logs to debug waiting bottlenecking 2022-08-17 11:33:49 -04:00
Alan Rominger
cfc1255812 Merge pull request #12442 from AlanCoding/waiting_reaper
Fix reaper false-positives for waiting jobs that are waiting for a worker
2022-08-17 11:20:05 -04:00
Alan Rominger
278db2cdde Split reaper for running and waiting jobs
Avoid running jobs that have already been reaped

Co-authored-by: Elijah DeLee <kdelee@redhat.com>

Remove unnecessary extra actions

Fix waiting jobs in other cases of reaping
2022-08-17 10:53:29 -04:00
Alan Rominger
64157f7207 Merge pull request #12584 from AlanCoding/lazy_workers
Wait 60 seconds before scaling down a worker
2022-08-17 10:18:19 -04:00
Alan Rominger
9e8ba6ca09 Merge pull request #12494 from AlanCoding/revival
Register system again if deleted by another pod
2022-08-17 10:12:39 -04:00
Alan Rominger
268ab128d7 Merge pull request #12527 from AlanCoding/offline_db
Further resiliency changes, specifically focused on case of database going offline
2022-08-17 10:10:50 -04:00
Alan Rominger
fad5934c1e Merge pull request #12356 from AlanCoding/copytree_neo
Replace git shallow clone with shutil.copytree
2022-08-17 10:07:28 -04:00
Alan Rominger
c9e3873a28 Use update_fields for Ansible facts update 2022-08-17 08:22:41 -04:00
Jessica Steurer
6a19aabd44 feature_request_form_update (#12625)
* Feature_update

* Feature_update

* update-feature-request

* update-edit
2022-08-17 08:52:30 -03:00
Alan Rominger
11e63e2e89 Remove an old metrics field and add a new one to dashboard 2022-08-16 22:37:27 -04:00
Hao Liu
7c885dcadb add help command to make (#12669)
add `make help`,
which prints all available make targets;
help text is generated from comments above the make target starting with `##`

Signed-off-by: Hao Liu <haoli@redhat.com>
2022-08-16 20:36:47 -04:00
John Westcott IV
b84a192bad Altering events relationship to hosts to increase performance (#12447)
Removing cascade on delete at model level that could cause locking issues.
2022-08-16 12:03:05 -04:00
Elijah DeLee
35afb10add fix use of distinct on a query that the UI makes
When on the screen in the UI that loads the job events, the UI includes
a filter to exclude job events where stdout = ''. Because this is a
TextField and was not in the allow list, we were applying DISTINCT to
the query. This made it perform very poorly for large jobs, especially
on the query that gets the count and cannot put a LIMIT on the query.

Also correctly prefetch the related job_template data on the view to
cut down the number of queries we make from around 50 to under 10.

We need to analyze other similar views for other prefetch type
optimizations we should make.
2022-08-16 10:08:33 -04:00
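
An illustrative sketch (not the actual AWX filter backend) of the allow-list
decision described above: DISTINCT is only added when a lookup touches a field
type that can introduce duplicate rows, so adding TextField to the list lets
the UI's stdout filter skip it.

from dataclasses import dataclass

@dataclass
class Field:
    name: str
    type_name: str  # e.g. 'CharField', 'TextField', 'ManyToManyField'

NO_DUPLICATES_ALLOW_LIST = {"CharField", "IntegerField", "BooleanField", "TextField"}

def needs_distinct(lookup_fields):
    # Any field outside the allow list forces DISTINCT on the queryset.
    return any(f.type_name not in NO_DUPLICATES_ALLOW_LIST for f in lookup_fields)

print(needs_distinct([Field("stdout", "TextField")]))             # False -> no DISTINCT
print(needs_distinct([Field("hosts__name", "ManyToManyField")]))  # True  -> DISTINCT required
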
Hao Liu
13fc845bcc develop AWX on MacOS using K8S
Add instructions for AWX development on MacOS using a Kind cluster

Signed-off-by: Hao Liu <haoli@redhat.com>
2022-08-15 22:48:23 -04:00
Alan Rominger
f1bd1f1dfc Merge pull request #12658 from AlanCoding/more_panels
Add more graphs for task manager refactor
2022-08-15 16:07:43 -04:00
Sarah Akus
67c9e1a0cb Merge pull request #12650 from matburt/fix_default_adhoc_verbosity
Fixed a bug where the initial form value of verbosity isn't respected
2022-08-15 15:48:49 -04:00
Alan Rominger
f6da9a5073 Add more graphs for task manager refactor 2022-08-15 15:29:34 -04:00
Seth Foster
38a0950f46 Merge pull request #12656 from fosterseth/metrics_tm_on_commit
Add metric for task manager on_commit calls
2022-08-15 13:54:34 -04:00
Seth Foster
55d295c2a6 Add metric to measure task manager transaction, including on_commit calls 2022-08-15 12:44:29 -04:00
Elijah DeLee
be45919ee4 have postgres log to console in dev env
also log slow queries and link to documentation for other possible
settings
2022-08-15 12:09:17 -04:00
mabashian
0a4a9f96c2 Explicitly set the default value for verbosity to 0, which corresponds to 0 (Normal) 2022-08-12 14:03:36 -04:00
Matthew Jones
1ae1da3f9c Fix a bug where the form value of verbosity isn't respected 2022-08-12 09:29:31 -04:00
Keith Grant
cae2c06190 Complex schedules UI (#12445)
* refactor ScheduleFormFields into own file

* refactor ScheduleForm

* wip complex schedules form

* build rruleset from inputs

* update schedule form validation for multiple repeat frequencies

* add basic rrule set parsing when opening schedule form

* complex schedule bugfixes, handle edge cases, etc

* fix schedule saving/parsing for single-occurrence schedules

* working with timezone issues

* fix rrule until times to be in UTC

* update tests for new schedule form format

* update ouiaIds

* tweak schedules spacing

* update ScheduleForm tests

* show message for unsupported schedule types

* default schedules to browser timezone

* show error type/message in ErrorDetail

* shows frequencies on ScheduleDetails view

* handles nullish values
2022-08-11 16:55:52 -04:00
John Westcott IV
993dd61024 Forcing an unbind for a django-auth-ldap sticky session to the LDAP server (#12367)
* Forcing an unbind for a django-auth-ldap sticky session to the LDAP server

* Forcing _connection_bound to false after closing and modifying exception logging
2022-08-11 16:46:41 -03:00
Alan Rominger
ea07aef73e Correct help text for job idle timeout 2022-08-11 09:39:29 -04:00
John Westcott IV
268a4ad32d Modifying reaper of administrative work units to allow for change from Controller to Hybrid nodes (#12614) 2022-08-11 09:03:35 -03:00
Sean Sullivan
3712af4df8 update role to provide better error messages (#12599) 2022-08-11 07:09:11 -04:00
Sean Sullivan
8cf75fce8c Update awx collection workflow nodes to look for type (#12597) 2022-08-11 07:08:27 -04:00
Alan Rominger
46be2d9e5b Replace git shallow clone with shutil.copytree
Introduce build_project_dir method
  the base method will create an empty project dir for workdir

Share code between job and inventory tasks with new mixin
  combine rest of pre_run_hook logic
  structure to hold lock for entire sync process

force sync to run for inventory updates due to UI issues

Remove reference to removed scm_last_revision field
2022-08-10 16:18:56 -04:00
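
A small sketch of the copy-instead-of-clone approach named above, using only the
standard library; the paths are temporary stand-ins for the synced project
directory and the job's private workdir.

import os
import shutil
import tempfile

source_project = tempfile.mkdtemp(prefix="project_")        # stands in for the already-synced project dir
open(os.path.join(source_project, "site.yml"), "w").close()

job_workdir = os.path.join(tempfile.mkdtemp(prefix="job_"), "project")
shutil.copytree(source_project, job_workdir, symlinks=True)  # copy the checkout; no git network access needed

print(os.listdir(job_workdir))  # ['site.yml']
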
Alan Rominger
998000bfbe Surface correct error from bulk_create on unrecoverable error 2022-08-10 16:16:57 -04:00
Alan Rominger
43a50cc62c Fix event counting in error handling path 2022-08-10 16:16:57 -04:00
Alan Rominger
30f556f845 Further resiliency changes focused on offline database
Make logs from database outage more manageable

Raise exception if update_model never recovers from problem
2022-08-10 16:16:57 -04:00
Alan Rominger
c5985c4c81 Change lazy worker method name and adjust log 2022-08-10 16:12:03 -04:00
Alan Rominger
a9170236e1 Wait 60 seconds before scaling down a worker 2022-08-10 16:12:03 -04:00
Seth Foster
85a5b58d18 Merge pull request #12629 from fosterseth/task_manager_refactor_squashed
Task manager refactor
2022-08-10 16:02:05 -04:00
Seth Foster
6fb3c8daa8 Merge pull request #44 from AlanCoding/one_of_seths_own
Inherit from our own APIView, not rest framework
2022-08-10 15:38:14 -04:00
Alan Rominger
a0103acbef Inherit from our own APIView, not rest framework 2022-08-10 15:31:19 -04:00
Alan Rominger
f7e6a32444 Optimize task manager with debug toolbar, adjust prefetch (#12588) 2022-08-10 10:05:13 -04:00
Alex Corey
7bbc256ff1 Merge pull request #12637 from AlexSCorey/12636-WorkflowApprovalTranslations
Fixes lack of translation on workflow approval list item actions
2022-08-09 15:47:34 -04:00
Alex Corey
64f62d6755 fixes translation issue 2022-08-09 15:30:08 -04:00
Alex Corey
b4cfe868fb Merge pull request #12546 from mabashian/6018-node-alias
Fix bug where node alias is not remaining after changing the template on a wf node
2022-08-09 10:16:46 -04:00
Alex Corey
8d8681580d Merge pull request #12548 from AlexSCorey/12512-UpdateWorkflowApprovalToolbar
Refactors and redesigns workflow approval to improve UX
2022-08-09 10:02:27 -04:00
Alex Corey
8892cf2622 Adds toast to workflow approval on cancel 2022-08-09 09:40:34 -04:00
Alan Rominger
585d3f4e2a Register system again if deleted by another pod
Avoid cases where a missing instance
would throw an error on startup;
this gives time for the heartbeat to register it
2022-08-08 22:36:17 -04:00
Alex Corey
2c9a0444e6 Easier review workflow output (#12459)
* Adds new tab component and positions it properly on screen

* Adds filtering, and navigation to node outputs
2022-08-08 16:13:51 -04:00
Alan Rominger
279cebcef3 Merge pull request #12586 from AlanCoding/connections_graph
Add a graph to show database connections being used
2022-08-08 15:49:20 -04:00
Seth Foster
e6f8852b05 Cache task_impact
task_impact is now a field on the database
It is calculated and set during create_unified_job

set task_impact on .save for adhoc commands
2022-08-05 14:33:47 -04:00
Alan Rominger
d06a3f060d Block sliced workflow jobs on any job type from their JT (#12551) 2022-08-05 14:33:45 -04:00
Seth Foster
957b2b7188 Cache preferred instance groups
When creating unified job, stash the list of pk values from the
instance groups returned from preferred_instance_groups so that the
task management system does not need to call out to this method
repeatedly.

.preferred_instance_groups_cache is the new field
2022-08-05 14:33:28 -04:00
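
A simplified sketch of the caching pattern described above, with toy classes in
place of the real AWX models: the expensive preferred-instance-group lookup runs
once at creation time, and the task manager only reads the cached pk list.

class InstanceGroup:
    def __init__(self, pk):
        self.pk = pk

class UnifiedJob:
    def __init__(self, preferred_instance_groups):
        # Relationship traversal happens exactly once, when the job is created.
        self.preferred_instance_groups_cache = [ig.pk for ig in preferred_instance_groups]

job = UnifiedJob([InstanceGroup(1), InstanceGroup(4)])
print(job.preferred_instance_groups_cache)  # [1, 4] -- the task manager reads this without extra queries
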
Alan Rominger
b94b3a1e91 [task_manager_refactor] Move approval node expiration logic into queryset (#12502)
Instead of loading all pending Workflow Approvals in the task manager,
run a query that will only return the expired approvals
directly expire all approvals returned by that query

Cache expires time as a new field in order to simplify WorkflowApproval filter
2022-08-05 14:33:27 -04:00
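
A simplified sketch of the queryset idea described above: cache an expires
timestamp on each approval and ask only for rows that are already past it,
rather than loading every pending approval and checking the timeout in Python.
Class and field names are illustrative, and the list comprehension stands in
for a filter(expires__lte=now) database query.

from datetime import datetime, timedelta, timezone

class WorkflowApproval:
    def __init__(self, name, created, timeout_seconds):
        self.name = name
        self.expires = created + timedelta(seconds=timeout_seconds)  # cached when the node is created

now = datetime.now(timezone.utc)
pending = [
    WorkflowApproval("deploy", now - timedelta(minutes=10), timeout_seconds=300),
    WorkflowApproval("cleanup", now, timeout_seconds=3600),
]

expired = [a for a in pending if a.expires <= now]  # stand-in for the single database query
print([a.name for a in expired])  # ['deploy']
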
Elijah DeLee
7776a81e22 add job to dependency graph in start task
We always add the job to the graph right before calling start task.
Reduce complexity of proper operation by just doing this in start_task,
because if you call start_task, you need to add it to the dependency
graph
2022-08-05 14:33:26 -04:00
Elijah DeLee
bf89093fac unify call pattern for get_tasks 2022-08-05 14:33:26 -04:00
Elijah DeLee
76d76d13b0 Start pending workflows in TaskManager
we had tried doing this in the WorkflowManager, but we decided that
we want to handle ALL pending jobs and "soft blockers" to jobs with the
TaskManager/DependencyGraph and not duplicate that logic in the
WorkflowManager.
2022-08-05 14:33:26 -04:00
Elijah DeLee
e603c23b40 fix sliced jobs blocking logic in dependency graph
We have to look at the sliced job's unified_job_template_id
Now, task_blocked_by works for sliced jobs too.
2022-08-05 14:33:26 -04:00
Alan Rominger
8af4dd5988 Fix unintended slice job blocking 2022-08-05 14:33:25 -04:00
Seth Foster
0a47d05d26 split schedule_task_manager into 3
each call to schedule_task_manager becomes one of

ScheduleTaskManager
ScheduleDependencyManager
ScheduleWorkflowManager
2022-08-05 14:33:25 -04:00
Seth Foster
b3eb9e0193 pid kill each of the 3 task managers on timeout 2022-08-05 14:33:25 -04:00
Elijah DeLee
b26d2ab0e9 fix looking at wrong id for wf allow_simultaneous 2022-08-05 14:33:25 -04:00
Elijah DeLee
7eb0c7dd28 exit task manager loops early if we are timed out
add settings to define task manager timeout and grace period

This still gives us TASK_MANAGER_TIMEOUT_GRACE_PERIOD amount of time to
get out of the task manager.

Also, apply start task limit in WorkflowManager to starting pending
workflows
2022-08-05 14:33:24 -04:00
Elijah DeLee
236c1df676 fix lint errors 2022-08-05 14:33:24 -04:00
Seth Foster
ff118f2177 Manage pending workflow jobs in Workflow Manager
get_tasks uses UnifiedJob
Additionally, make local overrides run after development settings
2022-08-05 14:31:48 -04:00
Elijah DeLee
29d91da1d2 we can do all the work in one loop
more than saving the loop, we save building the WorkflowDag twice which
makes LOTS of queries!!!

Also, do a bulk update on the WorkflowJobNodes instead of saving in a
loop :fear:
2022-08-05 14:31:48 -04:00
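
A plain-Python sketch of the bulk-update idea above: mutate every
WorkflowJobNode in memory, then persist them with one bulk write (Django's
bulk_update) instead of calling .save() inside the loop. The Node class and
fake_bulk_update helper are stand-ins for illustration.

class Node:
    def __init__(self, pk):
        self.pk = pk
        self.job = None

def fake_bulk_update(objs, fields):
    # A real ORM would issue a single UPDATE covering all of these rows here.
    print(f"1 query updates {len(objs)} rows on fields {fields}")

nodes = [Node(pk) for pk in range(5)]
for n in nodes:
    n.job = f"job-{n.pk}"  # in-memory change only

fake_bulk_update(nodes, ["job"])  # one write instead of len(nodes) saves
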
Elijah DeLee
ad08eafb9a add debug views for task manager(s)
implement https://github.com/ansible/awx/issues/12446
in the development environment, enable a set of views that run
the task manager(s).

Also introduce a setting that disables any calls to schedule()
that do not originate from the debug views when in the development
environment. With guards around both if we are in the development
environment and the setting, I think we're pretty safe this won't get
triggered unintentionally.

use MODE to determine if we are in devel env

Also, move test for skipping task managers to the tasks file
2022-08-05 14:31:24 -04:00
Seth Foster
431b9370df Split TaskManager into
- DependencyManager spawns dependencies if necessary
- WorkflowManager processes running workflows to see if a new job is
  ready to spawn
- TaskManager starts tasks if unblocked and has execution capacity
2022-08-05 14:29:02 -04:00
Alex Corey
3e93eefe62 Merge pull request #12618 from vedaperi/3999-NotificationHelpText
Add Help Text with documentation link to Notification Templates page
2022-08-05 10:41:07 -04:00
John Westcott IV
782667a34e Allow multiple values in SOCIAL_AUTH_SAML_USER_FLAGS_BY_ATTR.is_*_[value|role] settings (#12558) 2022-08-05 10:39:50 -04:00
dependabot[bot]
90524611ea Bump ace-builds from 1.6.0 to 1.8.1 in /awx/ui
Bumps [ace-builds](https://github.com/ajaxorg/ace-builds) from 1.6.0 to 1.8.1.
- [Release notes](https://github.com/ajaxorg/ace-builds/releases)
- [Changelog](https://github.com/ajaxorg/ace-builds/blob/master/CHANGELOG.md)
- [Commits](https://github.com/ajaxorg/ace-builds/compare/v1.6.0...v1.8.1)

---
updated-dependencies:
- dependency-name: ace-builds
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2022-08-05 14:39:49 +00:00
dependabot[bot]
583086ae62 Bump dompurify from 2.3.8 to 2.3.10 in /awx/ui
Bumps [dompurify](https://github.com/cure53/DOMPurify) from 2.3.8 to 2.3.10.
- [Release notes](https://github.com/cure53/DOMPurify/releases)
- [Commits](https://github.com/cure53/DOMPurify/compare/2.3.8...2.3.10)

---
updated-dependencies:
- dependency-name: dompurify
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2022-08-05 14:39:23 +00:00
Alex Corey
19c24cba10 Merge pull request #12602 from ansible/dependabot/npm_and_yarn/awx/ui/devel/prop-types-15.8.1
Bump prop-types from 15.7.2 to 15.8.1 in /awx/ui
2022-08-04 09:56:23 -04:00
Jeff Bradberry
5290c692c1 Merge pull request #12620 from jbradberry/even-narrower-reload
Restrict files that trigger a reload
2022-08-04 09:21:31 -04:00
Jeff Bradberry
90a19057d5 Restrict files that trigger a reload
to files explicitly ending in '.py' that do not start with a dot.
This will prevent Emacs lockfiles from triggering the restart.
2022-08-03 18:23:48 -04:00
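
A minimal sketch of the reload filter described above: only paths whose basename
ends in ".py" and does not start with a dot should restart the dev server, so an
Emacs lockfile such as ".#views.py" is ignored. This is illustrative, not the
actual reloader code.

import os

def should_trigger_reload(path):
    name = os.path.basename(path)
    return name.endswith(".py") and not name.startswith(".")

for candidate in ["awx/api/views.py", "awx/api/.#views.py", "README.md"]:
    print(candidate, should_trigger_reload(candidate))
# awx/api/views.py True
# awx/api/.#views.py False
# README.md False
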
dependabot[bot]
a05c328081 Bump prop-types from 15.7.2 to 15.8.1 in /awx/ui
Bumps [prop-types](https://github.com/facebook/prop-types) from 15.7.2 to 15.8.1.
- [Release notes](https://github.com/facebook/prop-types/releases)
- [Changelog](https://github.com/facebook/prop-types/blob/main/CHANGELOG.md)
- [Commits](https://github.com/facebook/prop-types/compare/v15.7.2...v15.8.1)

---
updated-dependencies:
- dependency-name: prop-types
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2022-08-03 16:09:07 +00:00
Alex Corey
6d9e353a4e Merge pull request #12603 from ansible/dependabot/npm_and_yarn/awx/ui/devel/rrule-2.7.1
Bump rrule from 2.7.0 to 2.7.1 in /awx/ui
2022-08-03 12:06:51 -04:00
Alex Corey
82c062eab9 Merge pull request #12605 from ansible/dependabot/npm_and_yarn/awx/ui/devel/luxon-3.0.1
Bump luxon from 2.4.0 to 3.0.1 in /awx/ui
2022-08-03 12:06:32 -04:00
vedaperi
c0d59801d5 Add help text to Notification Templates form and detail with link to documentation 2022-08-02 18:15:56 -07:00
Alex Corey
93ea8a0919 Adds toast to detail view and fixes non-disabled action button on list view 2022-08-02 17:18:29 -04:00
Rebeccah Hunter
67f1ab2237 Merge pull request #12609 from john-westcott-iv/oracle_awx_triage_reply
Adding triage response for inquiries around Oracle's version of AWX
2022-08-01 13:53:02 -04:00
John Westcott IV
71be8fadcb Adding GitHub check to ensure PRs have the proper X/Y/Z flags (#12577)
* Adding GitHub check to ensure PRs have the proper X/Y/Z flags
* Changing the Z release wording
2022-08-01 12:59:01 -04:00
John Westcott IV
c41becec13 Adding triage response for inquiries around Oracle's version of AWX 2022-08-01 12:00:48 -04:00
mabashian
6d0d8e57a4 Fix bug where node alias is not remaining after changing the template on a wf node 2022-08-01 11:28:50 -04:00
Alex Corey
1fca505b61 Refactors and redesigns workflow approval to improve UX 2022-08-01 09:59:53 -04:00
dependabot[bot]
a0e9c30b4a Bump luxon from 2.4.0 to 3.0.1 in /awx/ui
Bumps [luxon](https://github.com/moment/luxon) from 2.4.0 to 3.0.1.
- [Release notes](https://github.com/moment/luxon/releases)
- [Changelog](https://github.com/moment/luxon/blob/master/CHANGELOG.md)
- [Commits](https://github.com/moment/luxon/compare/2.4.0...3.0.1)

---
updated-dependencies:
- dependency-name: luxon
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2022-08-01 08:11:23 +00:00
dependabot[bot]
bc94dc0257 Bump rrule from 2.7.0 to 2.7.1 in /awx/ui
Bumps [rrule](https://github.com/jakubroztocil/rrule) from 2.7.0 to 2.7.1.
- [Release notes](https://github.com/jakubroztocil/rrule/releases)
- [Changelog](https://github.com/jakubroztocil/rrule/blob/master/CHANGELOG.md)
- [Commits](https://github.com/jakubroztocil/rrule/compare/v2.7.0...v2.7.1)

---
updated-dependencies:
- dependency-name: rrule
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2022-08-01 08:10:38 +00:00
Alan Rominger
3aa8320fc7 Add a graph to show database connections being used 2022-07-28 11:52:36 -04:00
Alan Rominger
29702400f1 Avoid parent instance update when status was unchanged 2022-04-22 09:07:03 -04:00
253 changed files with 9328 additions and 3743 deletions


@@ -1,3 +1,2 @@
awx/ui/node_modules
Dockerfile
.git


@@ -25,7 +25,7 @@ Instead use the bug or feature request.
<!--- Pick one below and delete the rest: -->
- Breaking Change
- New or Enhanced Feature
- Bug or Docs Fix
- Bug, Docs Fix or other nominal change
##### COMPONENT NAME


@@ -20,6 +20,19 @@ body:
- label: I understand that AWX is open source software provided for free and that I might not receive a timely response.
required: true
- type: dropdown
id: feature-type
attributes:
label: Feature type
description: >-
What kind of feature is this?
multiple: false
options:
- "New Feature"
- "Enhancement to Existing Feature"
validations:
required: true
- type: textarea
id: summary
attributes:
@@ -40,3 +53,36 @@ body:
- label: CLI
- label: Other
- type: textarea
id: steps-to-reproduce
attributes:
label: Steps to reproduce
description: >-
Describe the necessary steps to understand the scenario of the requested enhancement.
Include all the steps that will help the developer and QE team understand what you are requesting.
validations:
required: true
- type: textarea
id: current-results
attributes:
label: Current results
description: What is currently happening on the scenario?
validations:
required: true
- type: textarea
id: sugested-results
attributes:
label: Sugested feature result
description: What is the result this new feature will bring?
validations:
required: true
- type: textarea
id: additional-information
attributes:
label: Additional information
description: Please provide any other information you think is relevant that could help us understand your feature request.
validations:
required: false


@@ -11,7 +11,7 @@ the change does.
<!--- Pick one below and delete the rest: -->
- Breaking Change
- New or Enhanced Feature
- Bug or Docs Fix
- Bug, Docs Fix or other nominal change
##### COMPONENT NAME
<!--- Name of the module/plugin/module/task -->


@@ -93,6 +93,9 @@ The Ansible Community is looking at building an EE that corresponds to all of th
- AWX: https://github.com/ansible/awx/blob/devel/CONTRIBUTING.md
- AWX-Operator: https://github.com/ansible/awx-operator/blob/devel/CONTRIBUTING.md
### Oracle AWX
We'd be happy to help if you can reproduce this with AWX since we do not have Oracle's Linux Automation Manager. If you need help with this specific version of Oracle's Linux Automation Manager you will need to contact Oracle for support.
### AWX Release
Subject: Announcing AWX Xa.Ya.za and AWX-Operator Xb.Yb.zb


@@ -19,3 +19,34 @@ jobs:
not-before: 2021-12-07T07:00:00Z
configuration-path: .github/issue_labeler.yml
enable-versioned-regex: 0
community:
runs-on: ubuntu-latest
name: Label Issue - Community
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v4
- name: Install python requests
run: pip install requests
- name: Check if user is a member of Ansible org
uses: jannekem/run-python-script-action@v1
id: check_user
with:
script: |
import requests
headers = {'Accept': 'application/vnd.github+json', 'Authorization': 'token ${{ secrets.GITHUB_TOKEN }}'}
response = requests.get('${{ fromJson(toJson(github.event.issue.user.url)) }}/orgs?per_page=100', headers=headers)
is_member = False
for org in response.json():
if org['login'] == 'ansible':
is_member = True
if is_member:
print("User is member")
else:
print("User is community")
- name: Add community label if not a member
if: contains(steps.check_user.outputs.stdout, 'community')
uses: andymckay/labeler@e6c4322d0397f3240f0e7e30a33b5c5df2d39e90
with:
add-labels: "community"
repo-token: ${{ secrets.GITHUB_TOKEN }}


@@ -18,3 +18,34 @@ jobs:
with:
repo-token: "${{ secrets.GITHUB_TOKEN }}"
configuration-path: .github/pr_labeler.yml
community:
runs-on: ubuntu-latest
name: Label PR - Community
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v4
- name: Install python requests
run: pip install requests
- name: Check if user is a member of Ansible org
uses: jannekem/run-python-script-action@v1
id: check_user
with:
script: |
import requests
headers = {'Accept': 'application/vnd.github+json', 'Authorization': 'token ${{ secrets.GITHUB_TOKEN }}'}
response = requests.get('${{ fromJson(toJson(github.event.pull_request.user.url)) }}/orgs?per_page=100', headers=headers)
is_member = False
for org in response.json():
if org['login'] == 'ansible':
is_member = True
if is_member:
print("User is member")
else:
print("User is community")
- name: Add community label if not a member
if: contains(steps.check_user.outputs.stdout, 'community')
uses: andymckay/labeler@e6c4322d0397f3240f0e7e30a33b5c5df2d39e90
with:
add-labels: "community"
repo-token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/pr_body_check.yml (new file, 45 lines)

@@ -0,0 +1,45 @@
---
name: PR Check
env:
  BRANCH: ${{ github.base_ref || 'devel' }}
on:
  pull_request:
    types: [opened, edited, reopened, synchronize]
jobs:
  pr-check:
    name: Scan PR description for semantic versioning keywords
    runs-on: ubuntu-latest
    permissions:
      packages: write
      contents: read
    steps:
      - name: Write PR body to a file
        run: |
          cat >> pr.body << __SOME_RANDOM_PR_EOF__
          ${{ github.event.pull_request.body }}
          __SOME_RANDOM_PR_EOF__
      - name: Display the received body for troubleshooting
        run: cat pr.body
      # We want to write these out individually just incase the options were joined on a single line
      - name: Check for each of the lines
        run: |
          grep "Bug, Docs Fix or other nominal change" pr.body > Z
          grep "New or Enhanced Feature" pr.body > Y
          grep "Breaking Change" pr.body > X
          exit 0
        # We exit 0 and set the shell to prevent the returns from the greps from failing this step
        # See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#exit-codes-and-error-action-preference
        shell: bash {0}
      - name: Check for exactly one item
        run: |
          if [ $(cat X Y Z | wc -l) != 1 ] ; then
            echo "The PR body must contain exactly one of [ 'Bug, Docs Fix or other nominal change', 'New or Enhanced Feature', 'Breaking Change' ]"
            echo "We counted $(cat X Y Z | wc -l)"
            echo "See the default PR body for examples"
            exit 255;
          else
            exit 0;
          fi


@@ -0,0 +1,26 @@
---
name: Dependency Pr Update
on:
  pull_request:
    types: [labeled, opened, reopened]
jobs:
  pr-check:
    name: Update Dependabot Prs
    if: contains(github.event.pull_request.labels.*.name, 'dependencies') && contains(github.event.pull_request.labels.*.name, 'component:ui')
    runs-on: ubuntu-latest
    steps:
      - name: Checkout branch
        uses: actions/checkout@v3
      - name: Update PR Body
        env:
          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
          OWNER: ${{ github.repository_owner }}
          REPO: ${{ github.event.repository.name }}
          BRANCH: ${{github.event.pull_request.head.ref}}
          PR: ${{github.event.pull_request}}
        run: |
          gh pr checkout ${{ env.BRANCH }}
          gh pr edit --body "${{ env.PR }}\nBug, Docs Fix or other nominal change"

.gitignore

@@ -153,9 +153,6 @@ use_dev_supervisor.txt
/sanity/
/awx_collection_build/
# Setup for metrics gathering
tools/prometheus/prometheus.yml
.idea/*
*.unison.tmp
*.#


@@ -72,7 +72,7 @@ clean-languages:
rm -f $(I18N_FLAG_FILE)
find ./awx/locale/ -type f -regex ".*\.mo$" -delete
# Remove temporary build files, compiled Python files.
## Remove temporary build files, compiled Python files.
clean: clean-ui clean-api clean-awxkit clean-dist
rm -rf awx/public
rm -rf awx/lib/site-packages
@@ -94,7 +94,7 @@ clean-api:
clean-awxkit:
rm -rf awxkit/*.egg-info awxkit/.tox awxkit/build/*
# convenience target to assert environment variables are defined
## convenience target to assert environment variables are defined
guard-%:
@if [ "$${$*}" = "" ]; then \
echo "The required environment variable '$*' is not set"; \
@@ -117,7 +117,7 @@ virtualenv_awx:
fi; \
fi
# Install third-party requirements needed for AWX's environment.
## Install third-party requirements needed for AWX's environment.
# this does not use system site packages intentionally
requirements_awx: virtualenv_awx
if [[ "$(PIP_OPTIONS)" == *"--no-index"* ]]; then \
@@ -136,7 +136,7 @@ requirements_dev: requirements_awx requirements_awx_dev
requirements_test: requirements
# "Install" awx package in development mode.
## "Install" awx package in development mode.
develop:
@if [ "$(VIRTUAL_ENV)" ]; then \
pip uninstall -y awx; \
@@ -153,21 +153,21 @@ version_file:
fi; \
$(PYTHON) -c "import awx; print(awx.__version__)" > /var/lib/awx/.awx_version; \
# Refresh development environment after pulling new code.
## Refresh development environment after pulling new code.
refresh: clean requirements_dev version_file develop migrate
# Create Django superuser.
## Create Django superuser.
adduser:
$(MANAGEMENT_COMMAND) createsuperuser
# Create database tables and apply any new migrations.
## Create database tables and apply any new migrations.
migrate:
if [ "$(VENV_BASE)" ]; then \
. $(VENV_BASE)/awx/bin/activate; \
fi; \
$(MANAGEMENT_COMMAND) migrate --noinput
# Run after making changes to the models to create a new migration.
## Run after making changes to the models to create a new migration.
dbchange:
$(MANAGEMENT_COMMAND) makemigrations
@@ -218,7 +218,7 @@ wsbroadcast:
fi; \
$(PYTHON) manage.py run_wsbroadcast
# Run to start the background task dispatcher for development.
## Run to start the background task dispatcher for development.
dispatcher:
@if [ "$(VENV_BASE)" ]; then \
. $(VENV_BASE)/awx/bin/activate; \
@@ -226,7 +226,7 @@ dispatcher:
$(PYTHON) manage.py run_dispatcher
# Run to start the zeromq callback receiver
## Run to start the zeromq callback receiver
receiver:
@if [ "$(VENV_BASE)" ]; then \
. $(VENV_BASE)/awx/bin/activate; \
@@ -278,7 +278,7 @@ awx-link:
TEST_DIRS ?= awx/main/tests/unit awx/main/tests/functional awx/conf/tests awx/sso/tests
PYTEST_ARGS ?= -n auto
# Run all API unit tests.
## Run all API unit tests.
test:
if [ "$(VENV_BASE)" ]; then \
. $(VENV_BASE)/awx/bin/activate; \
@@ -341,23 +341,24 @@ test_unit:
fi; \
py.test awx/main/tests/unit awx/conf/tests/unit awx/sso/tests/unit
# Run all API unit tests with coverage enabled.
## Run all API unit tests with coverage enabled.
test_coverage:
@if [ "$(VENV_BASE)" ]; then \
. $(VENV_BASE)/awx/bin/activate; \
fi; \
py.test --create-db --cov=awx --cov-report=xml --junitxml=./reports/junit.xml $(TEST_DIRS)
# Output test coverage as HTML (into htmlcov directory).
## Output test coverage as HTML (into htmlcov directory).
coverage_html:
coverage html
# Run API unit tests across multiple Python/Django versions with Tox.
## Run API unit tests across multiple Python/Django versions with Tox.
test_tox:
tox -v
# Make fake data
DATA_GEN_PRESET = ""
## Make fake data
bulk_data:
@if [ "$(VENV_BASE)" ]; then \
. $(VENV_BASE)/awx/bin/activate; \
@@ -380,7 +381,8 @@ clean-ui:
awx/ui/node_modules:
NODE_OPTIONS=--max-old-space-size=6144 $(NPM_BIN) --prefix awx/ui --loglevel warn ci
$(UI_BUILD_FLAG_FILE): awx/ui/node_modules
$(UI_BUILD_FLAG_FILE):
$(MAKE) awx/ui/node_modules
$(PYTHON) tools/scripts/compilemessages.py
$(NPM_BIN) --prefix awx/ui --loglevel warn run compile-strings
$(NPM_BIN) --prefix awx/ui --loglevel warn run build
@@ -451,6 +453,11 @@ COMPOSE_OPTS ?=
CONTROL_PLANE_NODE_COUNT ?= 1
EXECUTION_NODE_COUNT ?= 2
MINIKUBE_CONTAINER_GROUP ?= false
EXTRA_SOURCES_ANSIBLE_OPTS ?=
ifneq ($(ADMIN_PASSWORD),)
EXTRA_SOURCES_ANSIBLE_OPTS := -e admin_password=$(ADMIN_PASSWORD) $(EXTRA_SOURCES_ANSIBLE_OPTS)
endif
docker-compose-sources: .git/hooks/pre-commit
@if [ $(MINIKUBE_CONTAINER_GROUP) = true ]; then\
@@ -468,7 +475,8 @@ docker-compose-sources: .git/hooks/pre-commit
-e enable_ldap=$(LDAP) \
-e enable_splunk=$(SPLUNK) \
-e enable_prometheus=$(PROMETHEUS) \
-e enable_grafana=$(GRAFANA)
-e enable_grafana=$(GRAFANA) $(EXTRA_SOURCES_ANSIBLE_OPTS)
docker-compose: awx/projects docker-compose-sources
@@ -502,7 +510,7 @@ docker-compose-container-group-clean:
fi
rm -rf tools/docker-compose-minikube/_sources/
# Base development image build
## Base development image build
docker-compose-build:
ansible-playbook tools/ansible/dockerfile.yml -e build_dev=True -e receptor_image=$(RECEPTOR_IMAGE)
DOCKER_BUILDKIT=1 docker build -t $(DEVEL_IMAGE_NAME) \
@@ -520,7 +528,7 @@ docker-clean-volumes: docker-compose-clean docker-compose-container-group-clean
docker-refresh: docker-clean docker-compose
# Docker Development Environment with Elastic Stack Connected
## Docker Development Environment with Elastic Stack Connected
docker-compose-elk: awx/projects docker-compose-sources
docker-compose -f tools/docker-compose/_sources/docker-compose.yml -f tools/elastic/docker-compose.logstash-link.yml -f tools/elastic/docker-compose.elastic-override.yml up --no-recreate
@@ -557,26 +565,34 @@ Dockerfile.kube-dev: tools/ansible/roles/dockerfile/templates/Dockerfile.j2
-e template_dest=_build_kube_dev \
-e receptor_image=$(RECEPTOR_IMAGE)
## Build awx_kube_devel image for development on local Kubernetes environment.
awx-kube-dev-build: Dockerfile.kube-dev
DOCKER_BUILDKIT=1 docker build -f Dockerfile.kube-dev \
--build-arg BUILDKIT_INLINE_CACHE=1 \
--cache-from=$(DEV_DOCKER_TAG_BASE)/awx_kube_devel:$(COMPOSE_TAG) \
-t $(DEV_DOCKER_TAG_BASE)/awx_kube_devel:$(COMPOSE_TAG) .
## Build awx image for deployment on Kubernetes environment.
awx-kube-build: Dockerfile
DOCKER_BUILDKIT=1 docker build -f Dockerfile \
--build-arg VERSION=$(VERSION) \
--build-arg SETUPTOOLS_SCM_PRETEND_VERSION=$(VERSION) \
--build-arg HEADLESS=$(HEADLESS) \
-t $(DEV_DOCKER_TAG_BASE)/awx:$(COMPOSE_TAG) .
# Translation TASKS
# --------------------------------------
# generate UI .pot file, an empty template of strings yet to be translated
## generate UI .pot file, an empty template of strings yet to be translated
pot: $(UI_BUILD_FLAG_FILE)
$(NPM_BIN) --prefix awx/ui --loglevel warn run extract-template --clean
# generate UI .po files for each locale (will update translated strings for `en`)
## generate UI .po files for each locale (will update translated strings for `en`)
po: $(UI_BUILD_FLAG_FILE)
$(NPM_BIN) --prefix awx/ui --loglevel warn run extract-strings -- --clean
# generate API django .pot .po
LANG = "en-us"
LANG = "en_us"
## generate API django .pot .po
messages:
@if [ "$(VENV_BASE)" ]; then \
. $(VENV_BASE)/awx/bin/activate; \
@@ -585,3 +601,38 @@ messages:
print-%:
@echo $($*)
# HELP related targets
# --------------------------------------
HELP_FILTER=.PHONY
## Display help targets
help:
@printf "Available targets:\n"
@make -s help/generate | grep -vE "\w($(HELP_FILTER))"
## Display help for all targets
help/all:
@printf "Available targets:\n"
@make -s help/generate
## Generate help output from MAKEFILE_LIST
help/generate:
@awk '/^[-a-zA-Z_0-9%:\\\.\/]+:/ { \
helpMessage = match(lastLine, /^## (.*)/); \
if (helpMessage) { \
helpCommand = $$1; \
helpMessage = substr(lastLine, RSTART + 3, RLENGTH); \
gsub("\\\\", "", helpCommand); \
gsub(":+$$", "", helpCommand); \
printf " \x1b[32;01m%-35s\x1b[0m %s\n", helpCommand, helpMessage; \
} else { \
helpCommand = $$1; \
gsub("\\\\", "", helpCommand); \
gsub(":+$$", "", helpCommand); \
printf " \x1b[32;01m%-35s\x1b[0m %s\n", helpCommand, "No help available"; \
} \
} \
{ lastLine = $$0 }' $(MAKEFILE_LIST) | sort -u
@printf "\n"

View File

@@ -190,7 +190,7 @@ def manage():
sys.stdout.write('%s\n' % __version__)
# If running as a user without permission to read settings, display an
# error message. Allow --help to still work.
elif settings.SECRET_KEY == 'permission-denied':
elif not os.getenv('SKIP_SECRET_KEY_CHECK', False) and settings.SECRET_KEY == 'permission-denied':
if len(sys.argv) == 1 or len(sys.argv) >= 2 and sys.argv[1] in ('-h', '--help', 'help'):
execute_from_command_line(sys.argv)
sys.stdout.write('\n')
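For local tooling that legitimately cannot read settings, the new guard can be bypassed by exporting the variable before invoking manage(); a minimal sketch (the variable name is exactly the one added above, everything else is illustrative):

import os

# Any non-empty string makes os.getenv() truthy, so the SECRET_KEY == 'permission-denied'
# branch above is skipped and manage() proceeds to normal command handling.
os.environ['SKIP_SECRET_KEY_CHECK'] = '1'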

View File

@@ -157,7 +157,7 @@ class FieldLookupBackend(BaseFilterBackend):
# A list of fields that we know can be filtered on without the possibility
# of introducing duplicates
NO_DUPLICATES_ALLOW_LIST = (CharField, IntegerField, BooleanField)
NO_DUPLICATES_ALLOW_LIST = (CharField, IntegerField, BooleanField, TextField)
def get_fields_from_lookup(self, model, lookup):

View File

@@ -154,6 +154,7 @@ SUMMARIZABLE_FK_FIELDS = {
'source_project': DEFAULT_SUMMARY_FIELDS + ('status', 'scm_type'),
'project_update': DEFAULT_SUMMARY_FIELDS + ('status', 'failed'),
'credential': DEFAULT_SUMMARY_FIELDS + ('kind', 'cloud', 'kubernetes', 'credential_type_id'),
'signature_validation_credential': DEFAULT_SUMMARY_FIELDS + ('kind', 'credential_type_id'),
'job': DEFAULT_SUMMARY_FIELDS + ('status', 'failed', 'elapsed', 'type', 'canceled_on'),
'job_template': DEFAULT_SUMMARY_FIELDS,
'workflow_job_template': DEFAULT_SUMMARY_FIELDS,
@@ -1470,6 +1471,7 @@ class ProjectSerializer(UnifiedJobTemplateSerializer, ProjectOptionsSerializer):
'allow_override',
'custom_virtualenv',
'default_environment',
'signature_validation_credential',
) + (
'last_update_failed',
'last_updated',
@@ -4195,6 +4197,15 @@ class JobLaunchSerializer(BaseSerializer):
elif template.project.status in ('error', 'failed'):
errors['playbook'] = _("Missing a revision to run due to failed project update.")
latest_update = template.project.project_updates.last()
if latest_update is not None and latest_update.failed:
failed_validation_tasks = latest_update.project_update_events.filter(
event='runner_on_failed',
play="Perform project signature/checksum verification",
)
if failed_validation_tasks:
errors['playbook'] = _("Last project update failed due to signature validation failure.")
# cannot run a playbook without an inventory
if template.inventory and template.inventory.pending_deletion is True:
errors['inventory'] = _("The inventory associated with this Job Template is being deleted.")
@@ -4753,7 +4764,7 @@ class ScheduleSerializer(LaunchConfigurationBaseSerializer, SchedulePreviewSeria
class InstanceLinkSerializer(BaseSerializer):
class Meta:
model = InstanceLink
fields = ('source', 'target', 'link_state')
fields = ('source', 'target')
source = serializers.SlugRelatedField(slug_field="hostname", read_only=True)
target = serializers.SlugRelatedField(slug_field="hostname", read_only=True)
@@ -4762,25 +4773,31 @@ class InstanceLinkSerializer(BaseSerializer):
class InstanceNodeSerializer(BaseSerializer):
class Meta:
model = Instance
fields = ('id', 'hostname', 'node_type', 'node_state', 'enabled')
fields = ('id', 'hostname', 'node_type', 'node_state')
node_state = serializers.SerializerMethodField()
def get_node_state(self, obj):
if not obj.enabled:
return "disabled"
return "error" if obj.errors else "healthy"
class InstanceSerializer(BaseSerializer):
consumed_capacity = serializers.SerializerMethodField()
percent_capacity_remaining = serializers.SerializerMethodField()
jobs_running = serializers.IntegerField(help_text=_('Count of jobs in the running or waiting state that are targeted for this instance'), read_only=True)
jobs_running = serializers.IntegerField(help_text=_('Count of jobs in the running or waiting state that ' 'are targeted for this instance'), read_only=True)
jobs_total = serializers.IntegerField(help_text=_('Count of all jobs that target this instance'), read_only=True)
class Meta:
model = Instance
read_only_fields = ('uuid', 'hostname', 'version', 'node_type', 'node_state')
read_only_fields = ('uuid', 'hostname', 'version', 'node_type')
fields = (
"id",
"type",
"url",
"related",
"summary_fields",
"uuid",
"hostname",
"created",
@@ -4802,7 +4819,6 @@ class InstanceSerializer(BaseSerializer):
"enabled",
"managed_by_policy",
"node_type",
"node_state",
)
def get_related(self, obj):
@@ -4814,14 +4830,6 @@ class InstanceSerializer(BaseSerializer):
res['health_check'] = self.reverse('api:instance_health_check', kwargs={'pk': obj.pk})
return res
def get_summary_fields(self, obj):
summary = super().get_summary_fields(obj)
if self.is_detail_view:
summary['links'] = InstanceLinkSerializer(InstanceLink.objects.select_related('target', 'source').filter(source=obj), many=True).data
return summary
def get_consumed_capacity(self, obj):
return obj.consumed_capacity

awx/api/urls/debug.py Normal file
View File

@@ -0,0 +1,17 @@
from django.urls import re_path
from awx.api.views.debug import (
DebugRootView,
TaskManagerDebugView,
DependencyManagerDebugView,
WorkflowManagerDebugView,
)
urls = [
re_path(r'^$', DebugRootView.as_view(), name='debug'),
re_path(r'^task_manager/$', TaskManagerDebugView.as_view(), name='task_manager'),
re_path(r'^dependency_manager/$', DependencyManagerDebugView.as_view(), name='dependency_manager'),
re_path(r'^workflow_manager/$', WorkflowManagerDebugView.as_view(), name='workflow_manager'),
]
__all__ = ['urls']

View File

@@ -2,9 +2,9 @@
# All Rights Reserved.
from __future__ import absolute_import, unicode_literals
from django.conf import settings
from django.urls import include, re_path
from awx import MODE
from awx.api.generics import LoggedLoginView, LoggedLogoutView
from awx.api.views import (
ApiRootView,
@@ -145,7 +145,12 @@ urlpatterns = [
re_path(r'^logout/$', LoggedLogoutView.as_view(next_page='/api/', redirect_field_name='next'), name='logout'),
re_path(r'^o/', include(oauth2_root_urls)),
]
if settings.SETTINGS_MODULE == 'awx.settings.development':
if MODE == 'development':
# Only include these if we are in the development environment
from awx.api.swagger import SwaggerSchemaView
urlpatterns += [re_path(r'^swagger/$', SwaggerSchemaView.as_view(), name='swagger_view')]
from awx.api.urls.debug import urls as debug_urls
urlpatterns += [re_path(r'^debug/', include(debug_urls))]

View File

@@ -93,7 +93,7 @@ from awx.main.utils import (
get_object_or_400,
getattrd,
get_pk_from_dict,
schedule_task_manager,
ScheduleWorkflowManager,
ignore_inventory_computed_fields,
)
from awx.main.utils.encryption import encrypt_value
@@ -440,7 +440,6 @@ class InstanceHealthCheck(GenericAPIView):
def post(self, request, *args, **kwargs):
obj = self.get_object()
# Note: hop nodes are already excluded by the get_queryset method
if obj.node_type == 'execution':
from awx.main.tasks.system import execution_node_health_check
@@ -3392,7 +3391,7 @@ class WorkflowJobCancel(RetrieveAPIView):
obj = self.get_object()
if obj.can_cancel:
obj.cancel()
schedule_task_manager()
ScheduleWorkflowManager().schedule()
return Response(status=status.HTTP_202_ACCEPTED)
else:
return self.http_method_not_allowed(request, *args, **kwargs)
@@ -3840,7 +3839,7 @@ class JobJobEventsList(BaseJobEventsList):
def get_queryset(self):
job = self.get_parent_object()
self.check_parent_access(job)
return job.get_event_queryset().select_related('host').order_by('start_line')
return job.get_event_queryset().prefetch_related('job__job_template', 'host').order_by('start_line')
class JobJobEventsChildrenSummary(APIView):

awx/api/views/debug.py Normal file
View File

@@ -0,0 +1,68 @@
from collections import OrderedDict
from django.conf import settings
from rest_framework.permissions import AllowAny
from rest_framework.response import Response
from awx.api.generics import APIView
from awx.main.scheduler import TaskManager, DependencyManager, WorkflowManager
class TaskManagerDebugView(APIView):
_ignore_model_permissions = True
exclude_from_schema = True
permission_classes = [AllowAny]
prefix = 'Task'
def get(self, request):
TaskManager().schedule()
if not settings.AWX_DISABLE_TASK_MANAGERS:
msg = f"Running {self.prefix} manager. To disable other triggers to the {self.prefix} manager, set AWX_DISABLE_TASK_MANAGERS to True"
else:
msg = f"AWX_DISABLE_TASK_MANAGERS is True, this view is the only way to trigger the {self.prefix} manager"
return Response(msg)
class DependencyManagerDebugView(APIView):
_ignore_model_permissions = True
exclude_from_schema = True
permission_classes = [AllowAny]
prefix = 'Dependency'
def get(self, request):
DependencyManager().schedule()
if not settings.AWX_DISABLE_TASK_MANAGERS:
msg = f"Running {self.prefix} manager. To disable other triggers to the {self.prefix} manager, set AWX_DISABLE_TASK_MANAGERS to True"
else:
msg = f"AWX_DISABLE_TASK_MANAGERS is True, this view is the only way to trigger the {self.prefix} manager"
return Response(msg)
class WorkflowManagerDebugView(APIView):
_ignore_model_permissions = True
exclude_from_schema = True
permission_classes = [AllowAny]
prefix = 'Workflow'
def get(self, request):
WorkflowManager().schedule()
if not settings.AWX_DISABLE_TASK_MANAGERS:
msg = f"Running {self.prefix} manager. To disable other triggers to the {self.prefix} manager, set AWX_DISABLE_TASK_MANAGERS to True"
else:
msg = f"AWX_DISABLE_TASK_MANAGERS is True, this view is the only way to trigger the {self.prefix} manager"
return Response(msg)
class DebugRootView(APIView):
_ignore_model_permissions = True
exclude_from_schema = True
permission_classes = [AllowAny]
def get(self, request, format=None):
'''List of available debug urls'''
data = OrderedDict()
data['task_manager'] = '/api/debug/task_manager/'
data['dependency_manager'] = '/api/debug/dependency_manager/'
data['workflow_manager'] = '/api/debug/workflow_manager/'
return Response(data)
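As a rough usage sketch, the new endpoints can be exercised with any HTTP client once the development URL patterns above are mounted (MODE == 'development'); the base host and port here are assumptions about a local docker-compose environment:

import requests  # any HTTP client works; requests is used here only for illustration

BASE = 'http://localhost:8013/api/debug/'  # host/port are assumptions about a local dev deployment

# The root view lists the three manager endpoints.
print(requests.get(BASE).json())

# Each GET runs the corresponding manager once and reports whether
# AWX_DISABLE_TASK_MANAGERS has silenced the other triggers.
for name in ('task_manager', 'dependency_manager', 'workflow_manager'):
    resp = requests.get(BASE + name + '/')
    print(name, resp.status_code, resp.json())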

View File

@@ -80,7 +80,7 @@ def _ctit_db_wrapper(trans_safe=False):
yield
except DBError as exc:
if trans_safe:
level = logger.exception
level = logger.warning
if isinstance(exc, ProgrammingError):
if 'relation' in str(exc) and 'does not exist' in str(exc):
# this generally means we can't fetch Tower configuration
@@ -89,7 +89,7 @@ def _ctit_db_wrapper(trans_safe=False):
# has come up *before* the database has finished migrating, and
# especially that the conf.settings table doesn't exist yet
level = logger.debug
level('Database settings are not available, using defaults.')
level(f'Database settings are not available, using defaults. error: {str(exc)}')
else:
logger.exception('Error modifying something related to database settings.')
finally:

View File

@@ -16,6 +16,7 @@ from awx.conf.license import get_license
from awx.main.utils import get_awx_version, camelcase_to_underscore, datetime_hook
from awx.main import models
from awx.main.analytics import register
from awx.main.scheduler.task_manager_models import TaskManagerInstances
"""
This module is used to define metrics collected by awx.main.analytics.gather()
@@ -235,25 +236,25 @@ def projects_by_scm_type(since, **kwargs):
@register('instance_info', '1.2', description=_('Cluster topology and capacity'))
def instance_info(since, include_hostnames=False, **kwargs):
info = {}
instances = models.Instance.objects.values_list('hostname').values(
'uuid', 'version', 'capacity', 'cpu', 'memory', 'managed_by_policy', 'hostname', 'enabled'
)
for instance in instances:
consumed_capacity = sum(x.task_impact for x in models.UnifiedJob.objects.filter(execution_node=instance['hostname'], status__in=('running', 'waiting')))
# Use same method that the TaskManager does to compute consumed capacity without querying all running jobs for each Instance
active_tasks = models.UnifiedJob.objects.filter(status__in=['running', 'waiting']).only('task_impact', 'controller_node', 'execution_node')
tm_instances = TaskManagerInstances(active_tasks, instance_fields=['uuid', 'version', 'capacity', 'cpu', 'memory', 'managed_by_policy', 'enabled'])
for tm_instance in tm_instances.instances_by_hostname.values():
instance = tm_instance.obj
instance_info = {
'uuid': instance['uuid'],
'version': instance['version'],
'capacity': instance['capacity'],
'cpu': instance['cpu'],
'memory': instance['memory'],
'managed_by_policy': instance['managed_by_policy'],
'enabled': instance['enabled'],
'consumed_capacity': consumed_capacity,
'remaining_capacity': instance['capacity'] - consumed_capacity,
'uuid': instance.uuid,
'version': instance.version,
'capacity': instance.capacity,
'cpu': instance.cpu,
'memory': instance.memory,
'managed_by_policy': instance.managed_by_policy,
'enabled': instance.enabled,
'consumed_capacity': tm_instance.consumed_capacity,
'remaining_capacity': instance.capacity - tm_instance.consumed_capacity,
}
if include_hostnames is True:
instance_info['hostname'] = instance['hostname']
info[instance['uuid']] = instance_info
instance_info['hostname'] = instance.hostname
info[instance.uuid] = instance_info
return info
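A small usage sketch of the reworked collector (the collectors module path is an assumption; in practice it is invoked through awx.main.analytics.gather() via the register() decorator above):

from awx.main.analytics.collectors import instance_info  # module path assumed

topology = instance_info(since=None, include_hostnames=True)
for uuid, info in topology.items():
    # consumed_capacity now comes from TaskManagerInstances rather than a per-instance query
    print(uuid, info['hostname'], info['consumed_capacity'], '/', info['capacity'])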

View File

@@ -166,7 +166,11 @@ class Metrics:
elif settings.IS_TESTING():
self.instance_name = "awx_testing"
else:
self.instance_name = Instance.objects.me().hostname
try:
self.instance_name = Instance.objects.me().hostname
except Exception as e:
self.instance_name = settings.CLUSTER_HOST_ID
logger.info(f'Instance {self.instance_name} seems to be unregistered, error: {e}')
# metric name, help_text
METRICSLIST = [
@@ -184,19 +188,29 @@ class Metrics:
FloatM('subsystem_metrics_pipe_execute_seconds', 'Time spent saving metrics to redis'),
IntM('subsystem_metrics_pipe_execute_calls', 'Number of calls to pipe_execute'),
FloatM('subsystem_metrics_send_metrics_seconds', 'Time spent sending metrics to other nodes'),
SetFloatM('task_manager_get_tasks_seconds', 'Time spent in loading all tasks from db'),
SetFloatM('task_manager_get_tasks_seconds', 'Time spent in loading tasks from db'),
SetFloatM('task_manager_start_task_seconds', 'Time spent starting task'),
SetFloatM('task_manager_process_running_tasks_seconds', 'Time spent processing running tasks'),
SetFloatM('task_manager_process_pending_tasks_seconds', 'Time spent processing pending tasks'),
SetFloatM('task_manager_generate_dependencies_seconds', 'Time spent generating dependencies for pending tasks'),
SetFloatM('task_manager_spawn_workflow_graph_jobs_seconds', 'Time spent spawning workflow jobs'),
SetFloatM('task_manager__schedule_seconds', 'Time spent in running the entire _schedule'),
IntM('task_manager_schedule_calls', 'Number of calls to task manager schedule'),
IntM('task_manager__schedule_calls', 'Number of calls to _schedule, after lock is acquired'),
SetFloatM('task_manager_recorded_timestamp', 'Unix timestamp when metrics were last recorded'),
SetIntM('task_manager_tasks_started', 'Number of tasks started'),
SetIntM('task_manager_running_processed', 'Number of running tasks processed'),
SetIntM('task_manager_pending_processed', 'Number of pending tasks processed'),
SetIntM('task_manager_tasks_blocked', 'Number of tasks blocked from running'),
SetFloatM('task_manager_commit_seconds', 'Time spent in db transaction, including on_commit calls'),
SetFloatM('dependency_manager_get_tasks_seconds', 'Time spent loading pending tasks from db'),
SetFloatM('dependency_manager_generate_dependencies_seconds', 'Time spent generating dependencies for pending tasks'),
SetFloatM('dependency_manager__schedule_seconds', 'Time spent in running the entire _schedule'),
IntM('dependency_manager__schedule_calls', 'Number of calls to _schedule, after lock is acquired'),
SetFloatM('dependency_manager_recorded_timestamp', 'Unix timestamp when metrics were last recorded'),
SetIntM('dependency_manager_pending_processed', 'Number of pending tasks processed'),
SetFloatM('workflow_manager__schedule_seconds', 'Time spent in running the entire _schedule'),
IntM('workflow_manager__schedule_calls', 'Number of calls to _schedule, after lock is acquired'),
SetFloatM('workflow_manager_recorded_timestamp', 'Unix timestamp when metrics were last recorded'),
SetFloatM('workflow_manager_spawn_workflow_graph_jobs_seconds', 'Time spent spawning workflow tasks'),
SetFloatM('workflow_manager_get_tasks_seconds', 'Time spent loading workflow tasks from db'),
]
# turn metric list into dictionary with the metric name as a key
self.METRICS = {}
@@ -303,7 +317,12 @@ class Metrics:
self.previous_send_metrics.set(current_time)
self.previous_send_metrics.store_value(self.conn)
finally:
lock.release()
try:
lock.release()
except Exception as exc:
# After system failures, we might throw redis.exceptions.LockNotOwnedError
# this is to avoid printing a traceback and, importantly, to avoid raising an exception into the parent context
logger.warning(f'Error releasing subsystem metrics redis lock, error: {str(exc)}')
def load_other_metrics(self, request):
# data received from other nodes are stored in their own keys

View File

@@ -446,7 +446,7 @@ register(
label=_('Default Job Idle Timeout'),
help_text=_(
'If no output is detected from ansible in this number of seconds the execution will be terminated. '
'Use value of 0 to used default idle_timeout is 600s.'
'Use value of 0 to indicate that no idle timeout should be imposed.'
),
category=_('Jobs'),
category_slug='jobs',

View File

@@ -4,6 +4,7 @@ import select
from contextlib import contextmanager
from django.conf import settings
from django.db import connection as pg_connection
NOT_READY = ([], [], [])
@@ -15,7 +16,6 @@ def get_local_queuename():
class PubSub(object):
def __init__(self, conn):
assert conn.autocommit, "Connection must be in autocommit mode."
self.conn = conn
def listen(self, channel):
@@ -31,6 +31,9 @@ class PubSub(object):
cur.execute('SELECT pg_notify(%s, %s);', (channel, payload))
def events(self, select_timeout=5, yield_timeouts=False):
if not self.conn.autocommit:
raise RuntimeError('Listening for events can only be done in autocommit mode')
while True:
if select.select([self.conn], [], [], select_timeout) == NOT_READY:
if yield_timeouts:
@@ -45,11 +48,32 @@ class PubSub(object):
@contextmanager
def pg_bus_conn():
conf = settings.DATABASES['default']
conn = psycopg2.connect(dbname=conf['NAME'], host=conf['HOST'], user=conf['USER'], password=conf['PASSWORD'], port=conf['PORT'], **conf.get("OPTIONS", {}))
# Django connection.cursor().connection doesn't have autocommit=True on
conn.set_session(autocommit=True)
def pg_bus_conn(new_connection=False):
'''
Any listeners probably want to establish a new database connection,
separate from the Django connection used for queries, because that will prevent
losing connection to the channel whenever a .close() happens.
Any publishers probably want to use the existing connection
so that messages follow postgres transaction rules
https://www.postgresql.org/docs/current/sql-notify.html
'''
if new_connection:
conf = settings.DATABASES['default']
conn = psycopg2.connect(
dbname=conf['NAME'], host=conf['HOST'], user=conf['USER'], password=conf['PASSWORD'], port=conf['PORT'], **conf.get("OPTIONS", {})
)
# Django connection.cursor().connection doesn't have autocommit=True on by default
conn.set_session(autocommit=True)
else:
if pg_connection.connection is None:
pg_connection.connect()
if pg_connection.connection is None:
raise RuntimeError('Unexpectedly could not connect to postgres for pg_notify actions')
conn = pg_connection.connection
pubsub = PubSub(conn)
yield pubsub
conn.close()
if new_connection:
conn.close()

View File

@@ -37,18 +37,24 @@ class Control(object):
def running(self, *args, **kwargs):
return self.control_with_reply('running', *args, **kwargs)
def cancel(self, task_ids, *args, **kwargs):
return self.control_with_reply('cancel', *args, extra_data={'task_ids': task_ids}, **kwargs)
@classmethod
def generate_reply_queue_name(cls):
return f"reply_to_{str(uuid.uuid4()).replace('-','_')}"
def control_with_reply(self, command, timeout=5):
def control_with_reply(self, command, timeout=5, extra_data=None):
logger.warning('checking {} {} for {}'.format(self.service, command, self.queuename))
reply_queue = Control.generate_reply_queue_name()
self.result = None
with pg_bus_conn() as conn:
with pg_bus_conn(new_connection=True) as conn:
conn.listen(reply_queue)
conn.notify(self.queuename, json.dumps({'control': command, 'reply_to': reply_queue}))
send_data = {'control': command, 'reply_to': reply_queue}
if extra_data:
send_data.update(extra_data)
conn.notify(self.queuename, json.dumps(send_data))
for reply in conn.events(select_timeout=timeout, yield_timeouts=True):
if reply is None:
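Together with the consumer-side handling further below, this gives a programmatic cancel path. A minimal sketch (assuming it runs on a node where the dispatcher is listening, and that running() replies with the task uuids the workers report):

from awx.main.dispatch.control import Control

ctl = Control('dispatcher')
uuids = ctl.running()       # reply is the list of uuids the local workers are managing
print(ctl.cancel(uuids))    # asks the dispatcher to SIGTERM any matching running tasks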

View File

@@ -16,13 +16,14 @@ from queue import Full as QueueFull, Empty as QueueEmpty
from django.conf import settings
from django.db import connection as django_connection, connections
from django.core.cache import cache as django_cache
from django.utils.timezone import now as tz_now
from django_guid import set_guid
from jinja2 import Template
import psutil
from awx.main.models import UnifiedJob
from awx.main.dispatch import reaper
from awx.main.utils.common import convert_mem_str_to_bytes, get_mem_effective_capacity
from awx.main.utils.common import convert_mem_str_to_bytes, get_mem_effective_capacity, log_excess_runtime
if 'run_callback_receiver' in sys.argv:
logger = logging.getLogger('awx.main.commands.run_callback_receiver')
@@ -328,12 +329,16 @@ class AutoscalePool(WorkerPool):
# Get same number as max forks based on memory, this function takes memory as bytes
self.max_workers = get_mem_effective_capacity(total_memory_gb * 2**30)
# add magic prime number of extra workers to ensure
# we have a few extra workers to run the heartbeat
self.max_workers += 7
# max workers can't be less than min_workers
self.max_workers = max(self.min_workers, self.max_workers)
def debug(self, *args, **kwargs):
self.cleanup()
return super(AutoscalePool, self).debug(*args, **kwargs)
# the task manager enforces settings.TASK_MANAGER_TIMEOUT on its own
# but if the task takes longer than the time defined here, we will force it to stop here
self.task_manager_timeout = settings.TASK_MANAGER_TIMEOUT + settings.TASK_MANAGER_TIMEOUT_GRACE_PERIOD
@property
def should_grow(self):
@@ -351,6 +356,7 @@ class AutoscalePool(WorkerPool):
def debug_meta(self):
return 'min={} max={}'.format(self.min_workers, self.max_workers)
@log_excess_runtime(logger)
def cleanup(self):
"""
Perform some internal account and cleanup. This is run on
@@ -359,8 +365,6 @@ class AutoscalePool(WorkerPool):
1. Discover worker processes that exited, and recover messages they
were handling.
2. Clean up unnecessary, idle workers.
3. Check to see if the database says this node is running any tasks
that aren't actually running. If so, reap them.
IMPORTANT: this function is one of the few places in the dispatcher
(aside from setting lookups) where we talk to the database. As such,
@@ -401,13 +405,15 @@ class AutoscalePool(WorkerPool):
# the task manager to never do more work
current_task = w.current_task
if current_task and isinstance(current_task, dict):
if current_task.get('task', '').endswith('tasks.run_task_manager'):
endings = ['tasks.task_manager', 'tasks.dependency_manager', 'tasks.workflow_manager']
current_task_name = current_task.get('task', '')
if any(current_task_name.endswith(e) for e in endings):
if 'started' not in current_task:
w.managed_tasks[current_task['uuid']]['started'] = time.time()
age = time.time() - current_task['started']
w.managed_tasks[current_task['uuid']]['age'] = age
if age > (60 * 5):
logger.error(f'run_task_manager has held the advisory lock for >5m, sending SIGTERM to {w.pid}') # noqa
if age > self.task_manager_timeout:
logger.error(f'{current_task_name} has held the advisory lock for {age}, sending SIGTERM to {w.pid}')
os.kill(w.pid, signal.SIGTERM)
for m in orphaned:
@@ -417,13 +423,17 @@ class AutoscalePool(WorkerPool):
idx = random.choice(range(len(self.workers)))
self.write(idx, m)
# if the database says a job is running on this node, but it's *not*,
# then reap it
running_uuids = []
for worker in self.workers:
worker.calculate_managed_tasks()
running_uuids.extend(list(worker.managed_tasks.keys()))
reaper.reap(excluded_uuids=running_uuids)
def add_bind_kwargs(self, body):
bind_kwargs = body.pop('bind_kwargs', [])
body.setdefault('kwargs', {})
if 'dispatch_time' in bind_kwargs:
body['kwargs']['dispatch_time'] = tz_now().isoformat()
if 'worker_tasks' in bind_kwargs:
worker_tasks = {}
for worker in self.workers:
worker.calculate_managed_tasks()
worker_tasks[worker.pid] = list(worker.managed_tasks.keys())
body['kwargs']['worker_tasks'] = worker_tasks
def up(self):
if self.full:
@@ -438,6 +448,8 @@ class AutoscalePool(WorkerPool):
if 'guid' in body:
set_guid(body['guid'])
try:
if isinstance(body, dict) and body.get('bind_kwargs'):
self.add_bind_kwargs(body)
# when the cluster heartbeat occurs, clean up internally
if isinstance(body, dict) and 'cluster_node_heartbeat' in body['task']:
self.cleanup()
@@ -452,6 +464,10 @@ class AutoscalePool(WorkerPool):
w.put(body)
break
else:
task_name = 'unknown'
if isinstance(body, dict):
task_name = body.get('task')
logger.warn(f'Workers maxed, queuing {task_name}, load: {sum(len(w.managed_tasks) for w in self.workers)} / {len(self.workers)}')
return super(AutoscalePool, self).write(preferred_queue, body)
except Exception:
for conn in connections.all():
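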

View File

@@ -2,6 +2,7 @@ import inspect
import logging
import sys
import json
import time
from uuid import uuid4
from django.conf import settings
@@ -49,13 +50,21 @@ class task:
@task(queue='tower_broadcast')
def announce():
print("Run this everywhere!")
# The special parameter bind_kwargs tells the main dispatcher process to add certain kwargs
@task(bind_kwargs=['dispatch_time'])
def print_time(dispatch_time=None):
print(f"Time I was dispatched: {dispatch_time}")
"""
def __init__(self, queue=None):
def __init__(self, queue=None, bind_kwargs=None):
self.queue = queue
self.bind_kwargs = bind_kwargs
def __call__(self, fn=None):
queue = self.queue
bind_kwargs = self.bind_kwargs
class PublisherMixin(object):
@@ -75,10 +84,12 @@ class task:
msg = f'{cls.name}: Queue value required and may not be None'
logger.error(msg)
raise ValueError(msg)
obj = {'uuid': task_id, 'args': args, 'kwargs': kwargs, 'task': cls.name}
obj = {'uuid': task_id, 'args': args, 'kwargs': kwargs, 'task': cls.name, 'time_pub': time.time()}
guid = get_guid()
if guid:
obj['guid'] = guid
if bind_kwargs:
obj['bind_kwargs'] = bind_kwargs
obj.update(**kw)
if callable(queue):
queue = queue()
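Putting the publisher change together with add_bind_kwargs() in the pool (shown earlier), a task can opt into both supported bind kwargs. A sketch, with the module path and the apply_async entry point assumed from surrounding code that is not part of this hunk:

from awx.main.dispatch.publish import task  # module path assumed

@task(queue='tower_broadcast', bind_kwargs=['dispatch_time', 'worker_tasks'])
def report(dispatch_time=None, worker_tasks=None):
    # dispatch_time is stamped by the dispatcher main process in add_bind_kwargs();
    # worker_tasks maps each worker pid to the task uuids it is currently managing.
    print(dispatch_time, worker_tasks)

# Publishing now also records 'time_pub', which TaskWorker later compares
# against 'time_ack' to log slow acknowledgement/dispatch times.
report.apply_async()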

View File

@@ -2,6 +2,7 @@ from datetime import timedelta
import logging
from django.db.models import Q
from django.conf import settings
from django.utils.timezone import now as tz_now
from django.contrib.contenttypes.models import ContentType
@@ -15,44 +16,71 @@ def startup_reaping():
If this particular instance is starting, then we know that any running jobs are invalid
so we will reap those jobs as a special action here
"""
me = Instance.objects.me()
try:
me = Instance.objects.me()
except RuntimeError as e:
logger.warning(f'Local instance is not registered, not running startup reaper: {e}')
return
jobs = UnifiedJob.objects.filter(status='running', controller_node=me.hostname)
job_ids = []
for j in jobs:
job_ids.append(j.id)
j.status = 'failed'
j.start_args = ''
j.job_explanation += 'Task was marked as running at system start up. The system must have not shut down properly, so it has been marked as failed.'
j.save(update_fields=['status', 'start_args', 'job_explanation'])
if hasattr(j, 'send_notification_templates'):
j.send_notification_templates('failed')
j.websocket_emit_status('failed')
reap_job(
j,
'failed',
job_explanation='Task was marked as running at system start up. The system must have not shut down properly, so it has been marked as failed.',
)
if job_ids:
logger.error(f'Unified jobs {job_ids} were reaped on dispatch startup')
def reap_job(j, status):
if UnifiedJob.objects.get(id=j.id).status not in ('running', 'waiting'):
def reap_job(j, status, job_explanation=None):
j.refresh_from_db(fields=['status', 'job_explanation'])
status_before = j.status
if status_before not in ('running', 'waiting'):
# just in case, don't reap jobs that aren't running
return
j.status = status
j.start_args = '' # blank field to remove encrypted passwords
j.job_explanation += ' '.join(
(
'Task was marked as running but was not present in',
'the job queue, so it has been marked as failed.',
)
)
if j.job_explanation:
j.job_explanation += ' ' # Separate messages for readability
if job_explanation is None:
j.job_explanation += 'Task was marked as running but was not present in the job queue, so it has been marked as failed.'
else:
j.job_explanation += job_explanation
j.save(update_fields=['status', 'start_args', 'job_explanation'])
if hasattr(j, 'send_notification_templates'):
j.send_notification_templates('failed')
j.websocket_emit_status(status)
logger.error('{} is no longer running; reaping'.format(j.log_format))
logger.error(f'{j.log_format} is no longer {status_before}; reaping')
def reap(instance=None, status='failed', excluded_uuids=[]):
def reap_waiting(instance=None, status='failed', job_explanation=None, grace_period=None, excluded_uuids=None, ref_time=None):
"""
Reap all jobs in waiting|running for this instance.
Reap all jobs in waiting for this instance.
"""
if grace_period is None:
grace_period = settings.JOB_WAITING_GRACE_PERIOD + settings.TASK_MANAGER_TIMEOUT
me = instance
if me is None:
try:
me = Instance.objects.me()
except RuntimeError as e:
logger.warning(f'Local instance is not registered, not running reaper: {e}')
return
if ref_time is None:
ref_time = tz_now()
jobs = UnifiedJob.objects.filter(status='waiting', modified__lte=ref_time - timedelta(seconds=grace_period), controller_node=me.hostname)
if excluded_uuids:
jobs = jobs.exclude(celery_task_id__in=excluded_uuids)
for j in jobs:
reap_job(j, status, job_explanation=job_explanation)
def reap(instance=None, status='failed', job_explanation=None, excluded_uuids=None):
"""
Reap all jobs in running for this instance.
"""
me = instance
if me is None:
@@ -61,12 +89,11 @@ def reap(instance=None, status='failed', excluded_uuids=[]):
except RuntimeError as e:
logger.warning(f'Local instance is not registered, not running reaper: {e}')
return
now = tz_now()
workflow_ctype_id = ContentType.objects.get_for_model(WorkflowJob).id
jobs = UnifiedJob.objects.filter(
(Q(status='running') | Q(status='waiting', modified__lte=now - timedelta(seconds=60)))
& (Q(execution_node=me.hostname) | Q(controller_node=me.hostname))
& ~Q(polymorphic_ctype_id=workflow_ctype_id)
).exclude(celery_task_id__in=excluded_uuids)
Q(status='running') & (Q(execution_node=me.hostname) | Q(controller_node=me.hostname)) & ~Q(polymorphic_ctype_id=workflow_ctype_id)
)
if excluded_uuids:
jobs = jobs.exclude(celery_task_id__in=excluded_uuids)
for j in jobs:
reap_job(j, status)
reap_job(j, status, job_explanation=job_explanation)
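A brief sketch of how the split functions are meant to be called from dispatcher code (the import mirrors the one used by the dispatcher pool above; the arguments shown are the newly added keyword parameters):

from awx.main.dispatch import reaper

# Running jobs recorded against this node that no local worker owns:
reaper.reap(excluded_uuids=['11111111-2222-3333-4444-555555555555'])  # example uuid

# Waiting jobs that have sat beyond the grace period (defaults to
# JOB_WAITING_GRACE_PERIOD + TASK_MANAGER_TIMEOUT when not given):
reaper.reap_waiting(grace_period=60, job_explanation='Reaped by a maintenance sweep.')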

View File

@@ -17,6 +17,7 @@ from django.conf import settings
from awx.main.dispatch.pool import WorkerPool
from awx.main.dispatch import pg_bus_conn
from awx.main.utils.common import log_excess_runtime
if 'run_callback_receiver' in sys.argv:
logger = logging.getLogger('awx.main.commands.run_callback_receiver')
@@ -62,7 +63,7 @@ class AWXConsumerBase(object):
def control(self, body):
logger.warning(f'Received control signal:\n{body}')
control = body.get('control')
if control in ('status', 'running'):
if control in ('status', 'running', 'cancel'):
reply_queue = body['reply_to']
if control == 'status':
msg = '\n'.join([self.listening_on, self.pool.debug()])
@@ -71,6 +72,17 @@ class AWXConsumerBase(object):
for worker in self.pool.workers:
worker.calculate_managed_tasks()
msg.extend(worker.managed_tasks.keys())
elif control == 'cancel':
msg = []
task_ids = set(body['task_ids'])
for worker in self.pool.workers:
task = worker.current_task
if task and task['uuid'] in task_ids:
logger.warn(f'Sending SIGTERM to task id={task["uuid"]}, task={task.get("task")}, args={task.get("args")}')
os.kill(worker.pid, signal.SIGTERM)
msg.append(task['uuid'])
if task_ids and not msg:
logger.info(f'Could not locate running tasks to cancel with ids={task_ids}')
with pg_bus_conn() as conn:
conn.notify(reply_queue, json.dumps(msg))
@@ -81,6 +93,9 @@ class AWXConsumerBase(object):
logger.error('unrecognized control message: {}'.format(control))
def process_task(self, body):
if isinstance(body, dict):
body['time_ack'] = time.time()
if 'control' in body:
try:
return self.control(body)
@@ -101,6 +116,7 @@ class AWXConsumerBase(object):
self.total_messages += 1
self.record_statistics()
@log_excess_runtime(logger)
def record_statistics(self):
if time.time() - self.last_stats > 1: # buffer stat recording to once per second
try:
@@ -149,7 +165,7 @@ class AWXConsumerPG(AWXConsumerBase):
while True:
try:
with pg_bus_conn() as conn:
with pg_bus_conn(new_connection=True) as conn:
for queue in self.queues:
conn.listen(queue)
if init is False:

View File

@@ -167,17 +167,27 @@ class CallbackBrokerWorker(BaseWorker):
try:
cls.objects.bulk_create(events)
metrics_bulk_events_saved += len(events)
except Exception:
except Exception as exc:
logger.warning(f'Error in events bulk_create, will try individually up to 5 errors, error {str(exc)}')
# if an exception occurs, we should re-attempt to save the
# events one-by-one, because something in the list is
# broken/stale
consecutive_errors = 0
events_saved = 0
metrics_events_batch_save_errors += 1
for e in events:
try:
e.save()
metrics_singular_events_saved += 1
except Exception:
logger.exception('Database Error Saving Job Event')
events_saved += 1
consecutive_errors = 0
except Exception as exc_indv:
consecutive_errors += 1
logger.info(f'Database Error Saving individual Job Event, error {str(exc_indv)}')
if consecutive_errors >= 5:
raise
metrics_singular_events_saved += events_saved
if events_saved == 0:
raise
metrics_duration_to_save = time.perf_counter() - metrics_duration_to_save
for e in events:
if not getattr(e, '_skip_websocket_message', False):
@@ -257,17 +267,18 @@ class CallbackBrokerWorker(BaseWorker):
try:
self.flush(force=flush)
break
except (OperationalError, InterfaceError, InternalError):
except (OperationalError, InterfaceError, InternalError) as exc:
if retries >= self.MAX_RETRIES:
logger.exception('Worker could not re-establish database connectivity, giving up on one or more events.')
return
delay = 60 * retries
logger.exception('Database Error Saving Job Event, retry #{i} in {delay} seconds:'.format(i=retries + 1, delay=delay))
logger.warning(f'Database Error Flushing Job Events, retry #{retries + 1} in {delay} seconds: {str(exc)}')
django_connection.close()
time.sleep(delay)
retries += 1
except DatabaseError:
logger.exception('Database Error Saving Job Event')
logger.exception('Database Error Flushing Job Events')
django_connection.close()
break
except Exception as exc:
tb = traceback.format_exc()

View File

@@ -3,6 +3,7 @@ import logging
import importlib
import sys
import traceback
import time
from kubernetes.config import kube_config
@@ -60,8 +61,19 @@ class TaskWorker(BaseWorker):
# the callable is a class, e.g., RunJob; instantiate and
# return its `run()` method
_call = _call().run
log_extra = ''
logger_method = logger.debug
if ('time_ack' in body) and ('time_pub' in body):
time_publish = body['time_ack'] - body['time_pub']
time_waiting = time.time() - body['time_ack']
if time_waiting > 5.0 or time_publish > 5.0:
# If the task took a very long time to process, add this information to the log
log_extra = f' took {time_publish:.4f} to ack, {time_waiting:.4f} in local dispatcher'
logger_method = logger.info
# don't print kwargs, they often contain launch-time secrets
logger.debug('task {} starting {}(*{})'.format(uuid, task, args))
logger_method(f'task {uuid} starting {task}(*{args}){log_extra}')
return _call(*args, **kwargs)
def perform_work(self, body):

View File

@@ -862,7 +862,7 @@ class Command(BaseCommand):
overwrite_vars=bool(options.get('overwrite_vars', False)),
)
inventory_update = inventory_source.create_inventory_update(
_eager_fields=dict(job_args=json.dumps(sys.argv), job_env=dict(os.environ.items()), job_cwd=os.getcwd())
_eager_fields=dict(status='running', job_args=json.dumps(sys.argv), job_env=dict(os.environ.items()), job_cwd=os.getcwd())
)
data = AnsibleInventoryLoader(source=source, verbosity=verbosity).load()

View File

@@ -54,7 +54,7 @@ class Command(BaseCommand):
capacity = f' capacity={x.capacity}' if x.node_type != 'hop' else ''
version = f" version={x.version or '?'}" if x.node_type != 'hop' else ''
heartbeat = f' heartbeat="{x.modified:%Y-%m-%d %H:%M:%S}"' if x.capacity or x.node_type == 'hop' else ''
heartbeat = f' heartbeat="{x.last_seen:%Y-%m-%d %H:%M:%S}"' if x.capacity or x.node_type == 'hop' else ''
print(f'\t{color}{x.hostname}{capacity} node_type={x.node_type}{version}{heartbeat}\033[0m')
print()

View File

@@ -27,9 +27,7 @@ class Command(BaseCommand):
)
def handle(self, **options):
# provides a mapping of hostname to Instance objects
nodes = Instance.objects.in_bulk(field_name='hostname')
if options['source'] not in nodes:
raise CommandError(f"Host {options['source']} is not a registered instance.")
if not (options['peers'] or options['disconnect'] or options['exact'] is not None):
@@ -59,9 +57,7 @@ class Command(BaseCommand):
results = 0
for target in options['peers']:
_, created = InstanceLink.objects.update_or_create(
source=nodes[options['source']], target=nodes[target], defaults={'link_state': InstanceLink.States.ESTABLISHED}
)
_, created = InstanceLink.objects.get_or_create(source=nodes[options['source']], target=nodes[target])
if created:
results += 1
@@ -84,9 +80,7 @@ class Command(BaseCommand):
links = set(InstanceLink.objects.filter(source=nodes[options['source']]).values_list('target__hostname', flat=True))
removals, _ = InstanceLink.objects.filter(source=nodes[options['source']], target__hostname__in=links - peers).delete()
for target in peers - links:
_, created = InstanceLink.objects.update_or_create(
source=nodes[options['source']], target=nodes[target], defaults={'link_state': InstanceLink.States.ESTABLISHED}
)
_, created = InstanceLink.objects.get_or_create(source=nodes[options['source']], target=nodes[target])
if created:
additions += 1

View File

@@ -1,13 +1,14 @@
# Copyright (c) 2015 Ansible, Inc.
# All Rights Reserved.
import logging
import yaml
from django.conf import settings
from django.core.cache import cache as django_cache
from django.core.management.base import BaseCommand
from django.db import connection as django_connection
from awx.main.dispatch import get_local_queuename, reaper
from awx.main.dispatch import get_local_queuename
from awx.main.dispatch.control import Control
from awx.main.dispatch.pool import AutoscalePool
from awx.main.dispatch.worker import AWXConsumerPG, TaskWorker
@@ -30,7 +31,16 @@ class Command(BaseCommand):
'--reload',
dest='reload',
action='store_true',
help=('cause the dispatcher to recycle all of its worker processes;' 'running jobs will run to completion first'),
help=('cause the dispatcher to recycle all of its worker processes; running jobs will run to completion first'),
)
parser.add_argument(
'--cancel',
dest='cancel',
help=(
'Cancel a particular task id. Takes either a single id string, or a JSON list of multiple ids. '
'Can take in output from the --running argument as input to cancel all tasks. '
'Only running tasks can be canceled, queued tasks must be started before they can be canceled.'
),
)
def handle(self, *arg, **options):
@@ -42,6 +52,16 @@ class Command(BaseCommand):
return
if options.get('reload'):
return Control('dispatcher').control({'control': 'reload'})
if options.get('cancel'):
cancel_str = options.get('cancel')
try:
cancel_data = yaml.safe_load(cancel_str)
except Exception:
cancel_data = [cancel_str]
if not isinstance(cancel_data, list):
cancel_data = [cancel_str]
print(Control('dispatcher').cancel(cancel_data))
return
# It's important to close these because we're _about_ to fork, and we
# don't want the forked processes to inherit the open sockets
@@ -53,7 +73,6 @@ class Command(BaseCommand):
# (like the node heartbeat)
periodic.run_continuously()
reaper.startup_reaping()
consumer = None
try:

View File

@@ -95,8 +95,13 @@ class Command(BaseCommand):
# database migrations are still running
from awx.main.models.ha import Instance
executor = MigrationExecutor(connection)
migrating = bool(executor.migration_plan(executor.loader.graph.leaf_nodes()))
try:
executor = MigrationExecutor(connection)
migrating = bool(executor.migration_plan(executor.loader.graph.leaf_nodes()))
except Exception as exc:
logger.info(f'Error on startup of run_wsbroadcast (error: {exc}), retry in 10s...')
time.sleep(10)
return
# In containerized deployments, migrations happen in the task container,
# and the services running there don't start until migrations are

View File

@@ -129,13 +129,10 @@ class InstanceManager(models.Manager):
# if instance was not retrieved by uuid and hostname was, use the hostname
instance = self.filter(hostname=hostname)
from awx.main.models import Instance
# Return existing instance
if instance.exists():
instance = instance.first() # in the unusual occasion that there is more than one, only get one
instance.node_state = Instance.States.INSTALLED # Wait for it to show up on the mesh
update_fields = ['node_state']
update_fields = []
# if instance was retrieved by uuid and hostname has changed, update hostname
if instance.hostname != hostname:
logger.warning("passed in hostname {0} is different from the original hostname {1}, updating to {0}".format(hostname, instance.hostname))
@@ -144,7 +141,6 @@ class InstanceManager(models.Manager):
# if any other fields are to be updated
if instance.ip_address != ip_address:
instance.ip_address = ip_address
update_fields.append('ip_address')
if instance.node_type != node_type:
instance.node_type = node_type
update_fields.append('node_type')
@@ -155,12 +151,12 @@ class InstanceManager(models.Manager):
return (False, instance)
# Create new instance, and fill in default values
create_defaults = {'node_state': Instance.States.INSTALLED, 'capacity': 0}
create_defaults = dict(capacity=0)
if defaults is not None:
create_defaults.update(defaults)
uuid_option = {}
if uuid is not None:
uuid_option = {'uuid': uuid}
uuid_option = dict(uuid=uuid)
if node_type == 'execution' and 'version' not in create_defaults:
create_defaults['version'] = RECEPTOR_PENDING
instance = self.create(hostname=hostname, ip_address=ip_address, node_type=node_type, **create_defaults, **uuid_option)

View File

@@ -1,79 +0,0 @@
# Generated by Django 3.2.13 on 2022-08-02 17:53
import django.core.validators
from django.db import migrations, models
def forwards(apps, schema_editor):
# All existing InstanceLink objects need to be in the state
# 'Established', which is the default, so nothing needs to be done
# for that.
Instance = apps.get_model('main', 'Instance')
for instance in Instance.objects.all():
instance.node_state = 'ready' if not instance.errors else 'unavailable'
instance.save(update_fields=['node_state'])
class Migration(migrations.Migration):
dependencies = [
('main', '0164_remove_inventorysource_update_on_project_update'),
]
operations = [
migrations.AddField(
model_name='instance',
name='listener_port',
field=models.PositiveIntegerField(
blank=True,
default=27199,
help_text='Port that Receptor will listen for incoming connections on.',
validators=[django.core.validators.MinValueValidator(1), django.core.validators.MaxValueValidator(65535)],
),
),
migrations.AddField(
model_name='instance',
name='node_state',
field=models.CharField(
choices=[
('provisioning', 'Provisioning'),
('provision-fail', 'Provisioning Failure'),
('installed', 'Installed'),
('ready', 'Ready'),
('unavailable', 'Unavailable'),
('deprovisioning', 'De-provisioning'),
('deprovision-fail', 'De-provisioning Failure'),
],
default='ready',
help_text='Indicates the current life cycle stage of this instance.',
max_length=16,
),
),
migrations.AddField(
model_name='instancelink',
name='link_state',
field=models.CharField(
choices=[('adding', 'Adding'), ('established', 'Established'), ('removing', 'Removing')],
default='established',
help_text='Indicates the current life cycle stage of this peer link.',
max_length=16,
),
),
migrations.AlterField(
model_name='instance',
name='node_type',
field=models.CharField(
choices=[
('control', 'Control plane node'),
('execution', 'Execution plane node'),
('hybrid', 'Controller and execution'),
('hop', 'Message-passing node, no execution capability'),
],
default='hybrid',
help_text='Role that this node plays in the mesh.',
max_length=16,
),
),
migrations.RunPython(forwards, reverse_code=migrations.RunPython.noop),
]

View File

@@ -0,0 +1,35 @@
# Generated by Django 3.2.13 on 2022-08-10 14:03
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('main', '0164_remove_inventorysource_update_on_project_update'),
]
operations = [
migrations.AddField(
model_name='unifiedjob',
name='preferred_instance_groups_cache',
field=models.JSONField(
blank=True, default=None, editable=False, help_text='A cached list with pk values from preferred instance groups.', null=True
),
),
migrations.AddField(
model_name='unifiedjob',
name='task_impact',
field=models.PositiveIntegerField(default=0, editable=False, help_text='Number of forks an instance consumes when running this job.'),
),
migrations.AddField(
model_name='workflowapproval',
name='expires',
field=models.DateTimeField(
default=None,
editable=False,
help_text='The time this approval will expire. This is the created time plus timeout, used for filtering.',
null=True,
),
),
]

View File

@@ -0,0 +1,40 @@
# Generated by Django 3.2.13 on 2022-07-06 13:19
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('main', '0165_task_manager_refactor'),
]
operations = [
migrations.AlterField(
model_name='adhoccommandevent',
name='host',
field=models.ForeignKey(
db_constraint=False,
default=None,
editable=False,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name='ad_hoc_command_events',
to='main.host',
),
),
migrations.AlterField(
model_name='jobevent',
name='host',
field=models.ForeignKey(
db_constraint=False,
default=None,
editable=False,
null=True,
on_delete=django.db.models.deletion.DO_NOTHING,
related_name='job_events_as_primary_host',
to='main.host',
),
),
]

View File

@@ -0,0 +1,57 @@
# Generated by Django 3.2.13 on 2022-08-24 14:02
from django.db import migrations, models
import django.db.models.deletion
from awx.main.models import CredentialType
from awx.main.utils.common import set_current_apps
def setup_tower_managed_defaults(apps, schema_editor):
set_current_apps(apps)
CredentialType.setup_tower_managed_defaults(apps)
class Migration(migrations.Migration):
dependencies = [
('main', '0166_alter_jobevent_host'),
]
operations = [
migrations.AddField(
model_name='project',
name='signature_validation_credential',
field=models.ForeignKey(
blank=True,
default=None,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name='projects_signature_validation',
to='main.credential',
help_text='An optional credential used for validating files in the project against unexpected changes.',
),
),
migrations.AlterField(
model_name='credentialtype',
name='kind',
field=models.CharField(
choices=[
('ssh', 'Machine'),
('vault', 'Vault'),
('net', 'Network'),
('scm', 'Source Control'),
('cloud', 'Cloud'),
('registry', 'Container Registry'),
('token', 'Personal Access Token'),
('insights', 'Insights'),
('external', 'External'),
('kubernetes', 'Kubernetes'),
('galaxy', 'Galaxy/Automation Hub'),
('cryptography', 'Cryptography'),
],
max_length=32,
),
),
migrations.RunPython(setup_tower_managed_defaults),
]
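Once this migration has run, the new managed credential type and the project field can be tied together; a rough ORM sketch (the lookup by namespace and the managed-type bootstrapping are assumptions beyond what the migration itself shows):

from awx.main.models import Credential, CredentialType, Project

gpg_type = CredentialType.objects.get(namespace='gpg_public_key')  # assumed lookup field
signing_key = Credential.objects.create(
    name='project content signing key',
    credential_type=gpg_type,
    inputs={'gpg_public_key': '-----BEGIN PGP PUBLIC KEY BLOCK----- ...'},  # placeholder value
)

project = Project.objects.first()  # illustrative; pick a real project
project.signature_validation_credential = signing_key
project.save(update_fields=['signature_validation_credential'])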

View File

@@ -36,7 +36,7 @@ def create_clearsessions_jt(apps, schema_editor):
if created:
sched = Schedule(
name='Cleanup Expired Sessions',
rrule='DTSTART:%s RRULE:FREQ=WEEKLY;INTERVAL=1;COUNT=1' % schedule_time,
rrule='DTSTART:%s RRULE:FREQ=WEEKLY;INTERVAL=1' % schedule_time,
description='Cleans out expired browser sessions',
enabled=True,
created=now_dt,
@@ -69,7 +69,7 @@ def create_cleartokens_jt(apps, schema_editor):
if created:
sched = Schedule(
name='Cleanup Expired OAuth 2 Tokens',
rrule='DTSTART:%s RRULE:FREQ=WEEKLY;INTERVAL=1;COUNT=1' % schedule_time,
rrule='DTSTART:%s RRULE:FREQ=WEEKLY;INTERVAL=1' % schedule_time,
description='Removes expired OAuth 2 access and refresh tokens',
enabled=True,
created=now_dt,

View File

@@ -90,6 +90,9 @@ class AdHocCommand(UnifiedJob, JobNotificationMixin):
extra_vars_dict = VarsDictProperty('extra_vars', True)
def _set_default_dependencies_processed(self):
self.dependencies_processed = True
def clean_inventory(self):
inv = self.inventory
if not inv:
@@ -178,12 +181,12 @@ class AdHocCommand(UnifiedJob, JobNotificationMixin):
def get_passwords_needed_to_start(self):
return self.passwords_needed_to_start
@property
def task_impact(self):
def _get_task_impact(self):
# NOTE: We sorta have to assume the host count matches and that forks default to 5
from awx.main.models.inventory import Host
count_hosts = Host.objects.filter(enabled=True, inventory__ad_hoc_commands__pk=self.pk).count()
if self.inventory:
count_hosts = self.inventory.total_hosts
else:
count_hosts = 5
return min(count_hosts, 5 if self.forks == 0 else self.forks) + 1
def copy(self):
@@ -207,10 +210,20 @@ class AdHocCommand(UnifiedJob, JobNotificationMixin):
def save(self, *args, **kwargs):
update_fields = kwargs.get('update_fields', [])
def add_to_update_fields(name):
if name not in update_fields:
update_fields.append(name)
if not self.preferred_instance_groups_cache:
self.preferred_instance_groups_cache = self._get_preferred_instance_group_cache()
add_to_update_fields("preferred_instance_groups_cache")
if not self.name:
self.name = Truncator(u': '.join(filter(None, (self.module_name, self.module_args)))).chars(512)
if 'name' not in update_fields:
update_fields.append('name')
add_to_update_fields("name")
if self.task_impact == 0:
self.task_impact = self._get_task_impact()
add_to_update_fields("task_impact")
super(AdHocCommand, self).save(*args, **kwargs)
@property

View File

@@ -316,16 +316,17 @@ class PrimordialModel(HasEditsMixin, CreatedModifiedModel):
user = get_current_user()
if user and not user.id:
user = None
if not self.pk and not self.created_by:
if (not self.pk) and (user is not None) and (not self.created_by):
self.created_by = user
if 'created_by' not in update_fields:
update_fields.append('created_by')
# Update modified_by if any editable fields have changed
new_values = self._get_fields_snapshot()
if (not self.pk and not self.modified_by) or self._values_have_edits(new_values):
self.modified_by = user
if 'modified_by' not in update_fields:
update_fields.append('modified_by')
if self.modified_by != user:
self.modified_by = user
if 'modified_by' not in update_fields:
update_fields.append('modified_by')
super(PrimordialModel, self).save(*args, **kwargs)
self._prior_values_store = new_values

View File

@@ -336,6 +336,7 @@ class CredentialType(CommonModelNameNotUnique):
('external', _('External')),
('kubernetes', _('Kubernetes')),
('galaxy', _('Galaxy/Automation Hub')),
('cryptography', _('Cryptography')),
)
kind = models.CharField(max_length=32, choices=KIND_CHOICES)
@@ -1171,6 +1172,25 @@ ManagedCredentialType(
},
)
ManagedCredentialType(
namespace='gpg_public_key',
kind='cryptography',
name=gettext_noop('GPG Public Key'),
inputs={
'fields': [
{
'id': 'gpg_public_key',
'label': gettext_noop('GPG Public Key'),
'type': 'string',
'secret': True,
'multiline': True,
'help_text': gettext_noop('GPG Public Key used to validate content signatures.'),
},
],
'required': ['gpg_public_key'],
},
)
class CredentialInputSource(PrimordialModel):
class Meta:

View File

@@ -25,7 +25,6 @@ analytics_logger = logging.getLogger('awx.analytics.job_events')
logger = logging.getLogger('awx.main.models.events')
__all__ = ['JobEvent', 'ProjectUpdateEvent', 'AdHocCommandEvent', 'InventoryUpdateEvent', 'SystemJobEvent']
@@ -486,13 +485,18 @@ class JobEvent(BasePlaybookEvent):
editable=False,
db_index=False,
)
# When we partitioned the table we accidentally "lost" the foreign key constraint.
# However this is good because the cascade on delete at the django layer was causing DB issues
# We are going to leave this as a foreign key but mark it as not having a DB relation and
# prevent cascading on delete.
host = models.ForeignKey(
'Host',
related_name='job_events_as_primary_host',
null=True,
default=None,
on_delete=models.SET_NULL,
on_delete=models.DO_NOTHING,
editable=False,
db_constraint=False,
)
host_name = models.CharField(
max_length=1024,
@@ -794,6 +798,10 @@ class AdHocCommandEvent(BaseCommandEvent):
editable=False,
db_index=False,
)
# We need to keep this as a FK in the model because AdHocCommand uses a ManyToMany field
# to hosts through adhoc_events. But in https://github.com/ansible/awx/pull/8236/ we
# removed the nulling of the field in case of a host going away before an event is saved
# so this needs to stay SET_NULL on the ORM level
host = models.ForeignKey(
'Host',
related_name='ad_hoc_command_events',
@@ -801,6 +809,7 @@ class AdHocCommandEvent(BaseCommandEvent):
default=None,
on_delete=models.SET_NULL,
editable=False,
db_constraint=False,
)
host_name = models.CharField(
max_length=1024,

View File

@@ -5,13 +5,14 @@ from decimal import Decimal
import logging
import os
from django.core.validators import MinValueValidator, MaxValueValidator
from django.core.validators import MinValueValidator
from django.db import models, connection
from django.db.models.signals import post_save, post_delete
from django.dispatch import receiver
from django.utils.translation import gettext_lazy as _
from django.conf import settings
from django.utils.timezone import now, timedelta
from django.db.models import Sum
import redis
from solo.models import SingletonModel
@@ -58,15 +59,6 @@ class InstanceLink(BaseModel):
source = models.ForeignKey('Instance', on_delete=models.CASCADE, related_name='+')
target = models.ForeignKey('Instance', on_delete=models.CASCADE, related_name='reverse_peers')
class States(models.TextChoices):
ADDING = 'adding', _('Adding')
ESTABLISHED = 'established', _('Established')
REMOVING = 'removing', _('Removing')
link_state = models.CharField(
choices=States.choices, default=States.ESTABLISHED, max_length=16, help_text=_("Indicates the current life cycle stage of this peer link.")
)
class Meta:
unique_together = ('source', 'target')
@@ -135,33 +127,13 @@ class Instance(HasPolicyEditsMixin, BaseModel):
default=0,
editable=False,
)
class Types(models.TextChoices):
CONTROL = 'control', _("Control plane node")
EXECUTION = 'execution', _("Execution plane node")
HYBRID = 'hybrid', _("Controller and execution")
HOP = 'hop', _("Message-passing node, no execution capability")
node_type = models.CharField(default=Types.HYBRID, choices=Types.choices, max_length=16, help_text=_("Role that this node plays in the mesh."))
class States(models.TextChoices):
PROVISIONING = 'provisioning', _('Provisioning')
PROVISION_FAIL = 'provision-fail', _('Provisioning Failure')
INSTALLED = 'installed', _('Installed')
READY = 'ready', _('Ready')
UNAVAILABLE = 'unavailable', _('Unavailable')
DEPROVISIONING = 'deprovisioning', _('De-provisioning')
DEPROVISION_FAIL = 'deprovision-fail', _('De-provisioning Failure')
node_state = models.CharField(
choices=States.choices, default=States.READY, max_length=16, help_text=_("Indicates the current life cycle stage of this instance.")
)
listener_port = models.PositiveIntegerField(
blank=True,
default=27199,
validators=[MinValueValidator(1), MaxValueValidator(65535)],
help_text=_("Port that Receptor will listen for incoming connections on."),
)
NODE_TYPE_CHOICES = [
("control", "Control plane node"),
("execution", "Execution plane node"),
("hybrid", "Controller and execution"),
("hop", "Message-passing node, no execution capability"),
]
node_type = models.CharField(default='hybrid', choices=NODE_TYPE_CHOICES, max_length=16)
peers = models.ManyToManyField('self', symmetrical=False, through=InstanceLink, through_fields=('source', 'target'))
@@ -178,10 +150,13 @@ class Instance(HasPolicyEditsMixin, BaseModel):
def consumed_capacity(self):
capacity_consumed = 0
if self.node_type in ('hybrid', 'execution'):
capacity_consumed += sum(x.task_impact for x in UnifiedJob.objects.filter(execution_node=self.hostname, status__in=('running', 'waiting')))
capacity_consumed += (
UnifiedJob.objects.filter(execution_node=self.hostname, status__in=('running', 'waiting')).aggregate(Sum("task_impact"))["task_impact__sum"]
or 0
)
if self.node_type in ('hybrid', 'control'):
capacity_consumed += sum(
settings.AWX_CONTROL_NODE_TASK_IMPACT for x in UnifiedJob.objects.filter(controller_node=self.hostname, status__in=('running', 'waiting'))
capacity_consumed += (
settings.AWX_CONTROL_NODE_TASK_IMPACT * UnifiedJob.objects.filter(controller_node=self.hostname, status__in=('running', 'waiting')).count()
)
return capacity_consumed
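The rewritten consumed_capacity pushes the summation into the database; the `or 0` guards against aggregate() returning None when no jobs match, and the control-plane side is just a constant per running job. A plain-Python arithmetic sketch of the same accounting (the constant here is illustrative, not the AWX setting value):

CONTROL_NODE_TASK_IMPACT = 1  # illustrative; the real value comes from AWX settings

def consumed_capacity(node_type, execution_impacts_sum, control_jobs_running):
    """execution_impacts_sum plays the role of the SQL SUM(task_impact), which may be None."""
    consumed = 0
    if node_type in ("hybrid", "execution"):
        consumed += execution_impacts_sum or 0   # None -> 0 when nothing is running
    if node_type in ("hybrid", "control"):
        consumed += CONTROL_NODE_TASK_IMPACT * control_jobs_running
    return consumed

print(consumed_capacity("hybrid", None, 0))  # 0
print(consumed_capacity("hybrid", 7, 3))     # 10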
@@ -232,24 +207,21 @@ class Instance(HasPolicyEditsMixin, BaseModel):
return True
if ref_time is None:
ref_time = now()
grace_period = settings.CLUSTER_NODE_HEARTBEAT_PERIOD * 2
grace_period = settings.CLUSTER_NODE_HEARTBEAT_PERIOD * settings.CLUSTER_NODE_MISSED_HEARTBEAT_TOLERANCE
if self.node_type in ('execution', 'hop'):
grace_period += settings.RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD
return self.last_seen < ref_time - timedelta(seconds=grace_period)
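The grace period is now the heartbeat period times a configurable missed-heartbeat tolerance, plus extra slack for execution and hop nodes. A standalone sketch of the same check; the numeric values are illustrative, not AWX defaults:

from datetime import datetime, timedelta, timezone

HEARTBEAT_PERIOD = 10           # seconds between heartbeats (illustrative)
MISSED_HEARTBEAT_TOLERANCE = 2  # how many heartbeats may be missed (illustrative)
ADVERTISEMENT_PERIOD = 60       # extra slack for execution/hop nodes (illustrative)

def is_lost(last_seen, node_type, ref_time=None):
    """Return True when a node has not been seen inside its grace period."""
    ref_time = ref_time or datetime.now(timezone.utc)
    grace_period = HEARTBEAT_PERIOD * MISSED_HEARTBEAT_TOLERANCE
    if node_type in ("execution", "hop"):
        grace_period += ADVERTISEMENT_PERIOD
    return last_seen < ref_time - timedelta(seconds=grace_period)

now = datetime.now(timezone.utc)
print(is_lost(now - timedelta(seconds=15), "control"))  # False: inside the 20s grace period
print(is_lost(now - timedelta(seconds=25), "control"))  # True: two heartbeats missed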
def mark_offline(self, update_last_seen=False, perform_save=True, errors=''):
if self.node_state not in (Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
if self.cpu_capacity == 0 and self.mem_capacity == 0 and self.capacity == 0 and self.errors == errors and (not update_last_seen):
return
if self.node_state == Instance.States.UNAVAILABLE and self.errors == errors and (not update_last_seen):
return
self.node_state = Instance.States.UNAVAILABLE
self.cpu_capacity = self.mem_capacity = self.capacity = 0
self.errors = errors
if update_last_seen:
self.last_seen = now()
if perform_save:
update_fields = ['node_state', 'capacity', 'cpu_capacity', 'mem_capacity', 'errors']
update_fields = ['capacity', 'cpu_capacity', 'mem_capacity', 'errors']
if update_last_seen:
update_fields += ['last_seen']
self.save(update_fields=update_fields)
@@ -306,9 +278,6 @@ class Instance(HasPolicyEditsMixin, BaseModel):
if not errors:
self.refresh_capacity_fields()
self.errors = ''
if self.node_state in (Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
self.node_state = Instance.States.READY
update_fields.append('node_state')
else:
self.mark_offline(perform_save=False, errors=errors)
update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity'])
@@ -327,7 +296,7 @@ class Instance(HasPolicyEditsMixin, BaseModel):
# playbook event data; we should consider this a zero capacity event
redis.Redis.from_url(settings.BROKER_URL).ping()
except redis.ConnectionError:
errors = _('Failed to connect to Redis')
errors = _('Failed to connect to Redis')
self.save_health_data(awx_application_version, get_cpu_count(), get_mem_in_bytes(), update_last_seen=True, errors=errors)

View File

@@ -236,6 +236,12 @@ class Inventory(CommonModelNameNotUnique, ResourceMixin, RelatedJobsMixin):
raise ParseError(_('Slice number must be 1 or higher.'))
return (number, step)
def get_sliced_hosts(self, host_queryset, slice_number, slice_count):
if slice_count > 1 and slice_number > 0:
offset = slice_number - 1
host_queryset = host_queryset[offset::slice_count]
return host_queryset
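get_sliced_hosts relies on Python's extended slicing (offset::step), which Django querysets support the same way lists do. A quick sketch of how hosts spread across slices:

def sliced_hosts(hosts, slice_number, slice_count):
    # slice_number is 1-based, matching the API convention; a count of 1 means "no slicing"
    if slice_count > 1 and slice_number > 0:
        offset = slice_number - 1
        return hosts[offset::slice_count]
    return hosts

hosts = [f"host{i}" for i in range(1, 8)]
print(sliced_hosts(hosts, 1, 3))  # ['host1', 'host4', 'host7']
print(sliced_hosts(hosts, 2, 3))  # ['host2', 'host5']
print(sliced_hosts(hosts, 3, 3))  # ['host3', 'host6']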
def get_script_data(self, hostvars=False, towervars=False, show_all=False, slice_number=1, slice_count=1):
hosts_kw = dict()
if not show_all:
@@ -243,10 +249,8 @@ class Inventory(CommonModelNameNotUnique, ResourceMixin, RelatedJobsMixin):
fetch_fields = ['name', 'id', 'variables', 'inventory_id']
if towervars:
fetch_fields.append('enabled')
hosts = self.hosts.filter(**hosts_kw).order_by('name').only(*fetch_fields)
if slice_count > 1 and slice_number > 0:
offset = slice_number - 1
hosts = hosts[offset::slice_count]
host_queryset = self.hosts.filter(**hosts_kw).order_by('name').only(*fetch_fields)
hosts = self.get_sliced_hosts(host_queryset, slice_number, slice_count)
data = dict()
all_group = data.setdefault('all', dict())
@@ -337,9 +341,12 @@ class Inventory(CommonModelNameNotUnique, ResourceMixin, RelatedJobsMixin):
else:
active_inventory_sources = self.inventory_sources.filter(source__in=CLOUD_INVENTORY_SOURCES)
failed_inventory_sources = active_inventory_sources.filter(last_job_failed=True)
total_hosts = active_hosts.count()
# if total_hosts has changed, set update_task_impact to True
update_task_impact = total_hosts != self.total_hosts
computed_fields = {
'has_active_failures': bool(failed_hosts.count()),
'total_hosts': active_hosts.count(),
'total_hosts': total_hosts,
'hosts_with_active_failures': failed_hosts.count(),
'total_groups': active_groups.count(),
'has_inventory_sources': bool(active_inventory_sources.count()),
@@ -357,6 +364,14 @@ class Inventory(CommonModelNameNotUnique, ResourceMixin, RelatedJobsMixin):
computed_fields.pop(field)
if computed_fields:
iobj.save(update_fields=computed_fields.keys())
if update_task_impact:
# if the total host count has changed, re-calculate task_impact for any
# job that is still pending for this inventory, since task_impact
# is cached at task creation and used by the task management system
tasks = self.jobs.filter(status="pending")
for t in tasks:
t.task_impact = t._get_task_impact()
UnifiedJob.objects.bulk_update(tasks, ['task_impact'])
logger.debug("Finished updating inventory computed fields, pk={0}, in " "{1:.3f} seconds".format(self.pk, time.time() - start_time))
def websocket_emit_status(self, status):
@@ -1220,8 +1235,7 @@ class InventoryUpdate(UnifiedJob, InventorySourceOptions, JobNotificationMixin,
return UnpartitionedInventoryUpdateEvent
return InventoryUpdateEvent
@property
def task_impact(self):
def _get_task_impact(self):
return 1
# InventoryUpdate credential required

View File

@@ -600,6 +600,19 @@ class Job(UnifiedJob, JobOptions, SurveyJobMixin, JobNotificationMixin, TaskMana
def get_ui_url(self):
return urljoin(settings.TOWER_URL_BASE, "/#/jobs/playbook/{}".format(self.pk))
def _set_default_dependencies_processed(self):
"""
This sets the initial value of dependencies_processed.
We use it as a shortcut to avoid the DependencyManager for jobs that do not need it.
"""
if (not self.project) or self.project.scm_update_on_launch:
self.dependencies_processed = False
elif (not self.inventory) or self.inventory.inventory_sources.filter(update_on_launch=True).exists():
self.dependencies_processed = False
else:
# No dependencies to process
self.dependencies_processed = True
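The docstring above describes a shortcut: jobs whose project and inventory sources need no on-launch updates can skip the DependencyManager entirely. A simplified stand-alone sketch of that decision, using plain dicts in place of the Django models:

def needs_dependency_pass(project, inventory_sources):
    """Return True when a freshly launched job still needs the dependency manager."""
    # A missing project, or one that updates on launch, forces dependency processing.
    if project is None or project.get("scm_update_on_launch"):
        return True
    # Any inventory source that updates on launch also forces a dependency pass.
    if any(src.get("update_on_launch") for src in inventory_sources):
        return True
    return False

print(needs_dependency_pass({"scm_update_on_launch": False}, []))                             # False
print(needs_dependency_pass({"scm_update_on_launch": True}, []))                              # True
print(needs_dependency_pass({"scm_update_on_launch": False}, [{"update_on_launch": True}]))   # True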
@property
def event_class(self):
if self.has_unpartitioned_events:
@@ -644,8 +657,7 @@ class Job(UnifiedJob, JobOptions, SurveyJobMixin, JobNotificationMixin, TaskMana
raise ParseError(_('{status_value} is not a valid status option.').format(status_value=status))
return self._get_hosts(**kwargs)
@property
def task_impact(self):
def _get_task_impact(self):
if self.launch_type == 'callback':
count_hosts = 2
else:
@@ -802,7 +814,8 @@ class Job(UnifiedJob, JobOptions, SurveyJobMixin, JobNotificationMixin, TaskMana
def _get_inventory_hosts(self, only=['name', 'ansible_facts', 'ansible_facts_modified', 'modified', 'inventory_id']):
if not self.inventory:
return []
return self.inventory.hosts.only(*only)
host_queryset = self.inventory.hosts.only(*only)
return self.inventory.get_sliced_hosts(host_queryset, self.job_slice_number, self.job_slice_count)
def start_job_fact_cache(self, destination, modification_times, timeout=None):
self.log_lifecycle("start_job_fact_cache")
@@ -847,7 +860,7 @@ class Job(UnifiedJob, JobOptions, SurveyJobMixin, JobNotificationMixin, TaskMana
continue
host.ansible_facts = ansible_facts
host.ansible_facts_modified = now()
host.save()
host.save(update_fields=['ansible_facts', 'ansible_facts_modified'])
system_tracking_logger.info(
'New fact for inventory {} host {}'.format(smart_str(host.inventory.name), smart_str(host.name)),
extra=dict(
@@ -1213,6 +1226,9 @@ class SystemJob(UnifiedJob, SystemJobOptions, JobNotificationMixin):
extra_vars_dict = VarsDictProperty('extra_vars', True)
def _set_default_dependencies_processed(self):
self.dependencies_processed = True
@classmethod
def _get_parent_field_name(cls):
return 'system_job_template'
@@ -1238,8 +1254,7 @@ class SystemJob(UnifiedJob, SystemJobOptions, JobNotificationMixin):
return UnpartitionedSystemJobEvent
return SystemJobEvent
@property
def task_impact(self):
def _get_task_impact(self):
return 5
@property

View File

@@ -412,6 +412,11 @@ class TaskManagerJobMixin(TaskManagerUnifiedJobMixin):
class Meta:
abstract = True
def get_jobs_fail_chain(self):
if self.project_update_id:
return [self.project_update]
return []
class TaskManagerUpdateOnLaunchMixin(TaskManagerUnifiedJobMixin):
class Meta:

View File

@@ -284,6 +284,17 @@ class Project(UnifiedJobTemplate, ProjectOptions, ResourceMixin, CustomVirtualEn
help_text=_('Allow changing the SCM branch or revision in a job template ' 'that uses this project.'),
)
# credential (keys) used to validate content signature
signature_validation_credential = models.ForeignKey(
'Credential',
related_name='%(class)ss_signature_validation',
blank=True,
null=True,
default=None,
on_delete=models.SET_NULL,
help_text=_('An optional credential used for validating files in the project against unexpected changes.'),
)
scm_revision = models.CharField(
max_length=1024,
blank=True,
@@ -513,6 +524,9 @@ class ProjectUpdate(UnifiedJob, ProjectOptions, JobNotificationMixin, TaskManage
help_text=_('The SCM Revision discovered by this update for the given project and branch.'),
)
def _set_default_dependencies_processed(self):
self.dependencies_processed = True
def _get_parent_field_name(self):
return 'project'
@@ -560,8 +574,7 @@ class ProjectUpdate(UnifiedJob, ProjectOptions, JobNotificationMixin, TaskManage
return UnpartitionedProjectUpdateEvent
return ProjectUpdateEvent
@property
def task_impact(self):
def _get_task_impact(self):
return 0 if self.job_type == 'run' else 1
@property
@@ -618,6 +631,10 @@ class ProjectUpdate(UnifiedJob, ProjectOptions, JobNotificationMixin, TaskManage
added_update_fields = []
if not self.job_tags:
job_tags = ['update_{}'.format(self.scm_type), 'install_roles', 'install_collections']
if self.project.signature_validation_credential is not None:
credential_type = self.project.signature_validation_credential.credential_type.namespace
job_tags.append(f'validation_{credential_type}')
job_tags.append('validation_checksum_manifest')
self.job_tags = ','.join(job_tags)
added_update_fields.append('job_tags')
if self.scm_delete_on_update and 'delete' not in self.job_tags and self.job_type == 'check':

View File

@@ -45,7 +45,8 @@ from awx.main.utils.common import (
get_type_for_model,
parse_yaml_or_json,
getattr_dne,
schedule_task_manager,
ScheduleDependencyManager,
ScheduleTaskManager,
get_event_partition_epoch,
get_capacity_type,
)
@@ -381,6 +382,11 @@ class UnifiedJobTemplate(PolymorphicModel, CommonModelNameNotUnique, ExecutionEn
unified_job.survey_passwords = new_job_passwords
kwargs['survey_passwords'] = new_job_passwords # saved in config object for relaunch
unified_job.preferred_instance_groups_cache = unified_job._get_preferred_instance_group_cache()
unified_job._set_default_dependencies_processed()
unified_job.task_impact = unified_job._get_task_impact()
from awx.main.signals import disable_activity_stream, activity_stream_create
with disable_activity_stream():
@@ -693,6 +699,14 @@ class UnifiedJob(
on_delete=polymorphic.SET_NULL,
help_text=_('The Instance group the job was run under'),
)
preferred_instance_groups_cache = models.JSONField(
blank=True,
null=True,
default=None,
editable=False,
help_text=_("A cached list with pk values from preferred instance groups."),
)
task_impact = models.PositiveIntegerField(default=0, editable=False, help_text=_("Number of forks an instance consumes when running this job."))
organization = models.ForeignKey(
'Organization',
blank=True,
@@ -754,6 +768,9 @@ class UnifiedJob(
def _get_parent_field_name(self):
return 'unified_job_template' # Override in subclasses.
def _get_preferred_instance_group_cache(self):
return [ig.pk for ig in self.preferred_instance_groups]
@classmethod
def _get_unified_job_template_class(cls):
"""
@@ -808,6 +825,9 @@ class UnifiedJob(
update_fields = self._update_parent_instance_no_save(parent_instance)
parent_instance.save(update_fields=update_fields)
def _set_default_dependencies_processed(self):
pass
def save(self, *args, **kwargs):
"""Save the job, with current status, to the database.
Ensure that all data is consistent before doing so.
@@ -821,7 +841,8 @@ class UnifiedJob(
# If this job already exists in the database, retrieve a copy of
# the job in its prior state.
if self.pk:
# If update_fields are given without status, then that indicates no change
if self.pk and ((not update_fields) or ('status' in update_fields)):
self_before = self.__class__.objects.get(pk=self.pk)
if self_before.status != self.status:
status_before = self_before.status
@@ -1026,7 +1047,6 @@ class UnifiedJob(
event_qs = self.get_event_queryset()
except NotImplementedError:
return True # Model without events, such as WFJT
self.log_lifecycle("event_processing_finished")
return self.emitted_events == event_qs.count()
def result_stdout_raw_handle(self, enforce_max_bytes=True):
@@ -1241,9 +1261,8 @@ class UnifiedJob(
except JobLaunchConfig.DoesNotExist:
return False
@property
def task_impact(self):
raise NotImplementedError # Implement in subclass.
def _get_task_impact(self):
return self.task_impact # return default, should implement in subclass.
def websocket_emit_data(self):
'''Return extra data that should be included when submitting data to the browser over the websocket connection'''
@@ -1255,7 +1274,7 @@ class UnifiedJob(
def _websocket_emit_status(self, status):
try:
status_data = dict(unified_job_id=self.id, status=status)
if status == 'waiting':
if status == 'running':
if self.instance_group:
status_data['instance_group_name'] = self.instance_group.name
else:
@@ -1358,7 +1377,10 @@ class UnifiedJob(
self.update_fields(start_args=json.dumps(kwargs), status='pending')
self.websocket_emit_status("pending")
schedule_task_manager()
if self.dependencies_processed:
ScheduleTaskManager().schedule()
else:
ScheduleDependencyManager().schedule()
# Each type of unified job has a different Task class; get the
# appropriate one.
@@ -1373,22 +1395,6 @@ class UnifiedJob(
# Done!
return True
@property
def actually_running(self):
# returns True if the job is running in the appropriate dispatcher process
running = False
if all([self.status == 'running', self.celery_task_id, self.execution_node]):
# If the job is marked as running, but the dispatcher
# doesn't know about it (or the dispatcher doesn't reply),
# then cancel the job
timeout = 5
try:
running = self.celery_task_id in ControlDispatcher('dispatcher', self.controller_node or self.execution_node).running(timeout=timeout)
except (socket.timeout, RuntimeError):
logger.error('could not reach dispatcher on {} within {}s'.format(self.execution_node, timeout))
running = False
return running
@property
def can_cancel(self):
return bool(self.status in CAN_CANCEL)
@@ -1398,27 +1404,61 @@ class UnifiedJob(
return 'Previous Task Canceled: {"job_type": "%s", "job_name": "%s", "job_id": "%s"}' % (self.model_to_str(), self.name, self.id)
return None
def fallback_cancel(self):
if not self.celery_task_id:
self.refresh_from_db(fields=['celery_task_id'])
self.cancel_dispatcher_process()
def cancel_dispatcher_process(self):
"""Returns True if dispatcher running this job acknowledged request and sent SIGTERM"""
if not self.celery_task_id:
return
canceled = []
try:
# Use control and reply mechanism to cancel and obtain confirmation
timeout = 5
canceled = ControlDispatcher('dispatcher', self.controller_node).cancel([self.celery_task_id])
except socket.timeout:
logger.error(f'could not reach dispatcher on {self.controller_node} within {timeout}s')
except Exception:
logger.exception("error encountered when checking task status")
return bool(self.celery_task_id in canceled) # True or False, whether confirmation was obtained
def cancel(self, job_explanation=None, is_chain=False):
if self.can_cancel:
if not is_chain:
for x in self.get_jobs_fail_chain():
x.cancel(job_explanation=self._build_job_explanation(), is_chain=True)
cancel_fields = []
if not self.cancel_flag:
self.cancel_flag = True
self.start_args = '' # blank field to remove encrypted passwords
cancel_fields = ['cancel_flag', 'start_args']
if self.status in ('pending', 'waiting', 'new'):
self.status = 'canceled'
cancel_fields.append('status')
if self.status == 'running' and not self.actually_running:
self.status = 'canceled'
cancel_fields.append('status')
cancel_fields.extend(['cancel_flag', 'start_args'])
connection.on_commit(lambda: self.websocket_emit_status("canceled"))
if job_explanation is not None:
self.job_explanation = job_explanation
cancel_fields.append('job_explanation')
self.save(update_fields=cancel_fields)
self.websocket_emit_status("canceled")
controller_notified = False
if self.celery_task_id:
controller_notified = self.cancel_dispatcher_process()
else:
# Avoid a race condition where we have a stale model from the pending state but the job has already started;
# it is checking for the signal but not cancel_flag, so re-send the signal after this database commit
connection.on_commit(self.fallback_cancel)
# If a SIGTERM signal was sent to the control process, and acked by the dispatcher
# then we want to let its own cleanup change status, otherwise change status now
if not controller_notified:
if self.status != 'canceled':
self.status = 'canceled'
cancel_fields.append('status')
self.save(update_fields=cancel_fields)
return self.cancel_flag
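The rewritten cancel() only flips the status itself when the dispatcher did not acknowledge the cancel request; otherwise the dispatcher's own cleanup changes it. A simplified sketch of that bookkeeping, with the dispatcher call stubbed out as a boolean:

def cancel(job, dispatcher_acked):
    """Sketch of the status handling in cancel(); dispatcher_acked stands in for
    cancel_dispatcher_process(), which is stubbed out here."""
    fields = []
    if not job["cancel_flag"]:
        job["cancel_flag"] = True
        fields += ["cancel_flag", "start_args"]
    if not dispatcher_acked and job["status"] != "canceled":
        # the dispatcher never got the signal, so nothing else will update the status
        job["status"] = "canceled"
        fields.append("status")
    return fields

print(cancel({"cancel_flag": False, "status": "running"}, dispatcher_acked=True))   # ['cancel_flag', 'start_args']
print(cancel({"cancel_flag": False, "status": "running"}, dispatcher_acked=False))  # ['cancel_flag', 'start_args', 'status']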
@property
@@ -1515,8 +1555,8 @@ class UnifiedJob(
'state': state,
'work_unit_id': self.work_unit_id,
}
if self.unified_job_template:
extra["template_name"] = self.unified_job_template.name
if self.name:
extra["task_name"] = self.name
if state == "blocked" and blocked_by:
blocked_by_msg = f"{blocked_by._meta.model_name}-{blocked_by.id}"
msg = f"{self._meta.model_name}-{self.id} blocked by {blocked_by_msg}"
@@ -1528,7 +1568,7 @@ class UnifiedJob(
extra["controller_node"] = self.controller_node or "NOT_SET"
elif state == "execution_node_chosen":
extra["execution_node"] = self.execution_node or "NOT_SET"
logger_job_lifecycle.debug(msg, extra=extra)
logger_job_lifecycle.info(msg, extra=extra)
@property
def launched_by(self):

View File

@@ -13,6 +13,7 @@ from django.db import connection, models
from django.conf import settings
from django.utils.translation import gettext_lazy as _
from django.core.exceptions import ObjectDoesNotExist
from django.utils.timezone import now, timedelta
# from django import settings as tower_settings
@@ -40,7 +41,7 @@ from awx.main.models.mixins import (
from awx.main.models.jobs import LaunchTimeConfigBase, LaunchTimeConfig, JobTemplate
from awx.main.models.credential import Credential
from awx.main.redact import REPLACE_STR
from awx.main.utils import schedule_task_manager
from awx.main.utils import ScheduleWorkflowManager
__all__ = [
@@ -622,6 +623,9 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, SurveyJobMixin, JobNotificatio
)
is_sliced_job = models.BooleanField(default=False)
def _set_default_dependencies_processed(self):
self.dependencies_processed = True
@property
def workflow_nodes(self):
return self.workflow_job_nodes
@@ -668,8 +672,7 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, SurveyJobMixin, JobNotificatio
)
return result
@property
def task_impact(self):
def _get_task_impact(self):
return 0
def get_ancestor_workflows(self):
@@ -720,11 +723,10 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, SurveyJobMixin, JobNotificatio
def preferred_instance_groups(self):
return []
@property
def actually_running(self):
def cancel_dispatcher_process(self):
# WorkflowJobs don't _actually_ run anything in the dispatcher, so
# there's no point in asking the dispatcher if it knows about this task
return self.status == 'running'
return True
class WorkflowApprovalTemplate(UnifiedJobTemplate, RelatedJobsMixin):
@@ -783,6 +785,12 @@ class WorkflowApproval(UnifiedJob, JobNotificationMixin):
default=0,
help_text=_("The amount of time (in seconds) before the approval node expires and fails."),
)
expires = models.DateTimeField(
default=None,
null=True,
editable=False,
help_text=_("The time this approval will expire. This is the created time plus timeout, used for filtering."),
)
timed_out = models.BooleanField(default=False, help_text=_("Shows when an approval node (with a timeout assigned to it) has timed out."))
approved_or_denied_by = models.ForeignKey(
'auth.User',
@@ -793,6 +801,9 @@ class WorkflowApproval(UnifiedJob, JobNotificationMixin):
on_delete=models.SET_NULL,
)
def _set_default_dependencies_processed(self):
self.dependencies_processed = True
@classmethod
def _get_unified_job_template_class(cls):
return WorkflowApprovalTemplate
@@ -810,13 +821,32 @@ class WorkflowApproval(UnifiedJob, JobNotificationMixin):
def _get_parent_field_name(self):
return 'workflow_approval_template'
def save(self, *args, **kwargs):
update_fields = list(kwargs.get('update_fields', []))
if self.timeout != 0 and ((not self.pk) or (not update_fields) or ('timeout' in update_fields)):
if not self.created: # on creation, created will be set by parent class, so we fudge it here
created = now()
else:
created = self.created
new_expires = created + timedelta(seconds=self.timeout)
if new_expires != self.expires:
self.expires = new_expires
if update_fields and 'expires' not in update_fields:
update_fields.append('expires')
elif self.timeout == 0 and ((not update_fields) or ('timeout' in update_fields)):
if self.expires:
self.expires = None
if update_fields and 'expires' not in update_fields:
update_fields.append('expires')
super(WorkflowApproval, self).save(*args, **kwargs)
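The new expires column is simply created plus timeout, maintained in save() so expired approvals can be found with one filter instead of re-deriving the deadline per row. A standalone sketch of the bookkeeping:

from datetime import datetime, timedelta, timezone

def compute_expires(created, timeout_seconds):
    """A timeout of 0 means the approval never expires (mirrors the model's convention)."""
    if timeout_seconds == 0:
        return None
    return created + timedelta(seconds=timeout_seconds)

created = datetime(2022, 9, 15, 12, 0, tzinfo=timezone.utc)
print(compute_expires(created, 300))  # 2022-09-15 12:05:00+00:00
print(compute_expires(created, 0))    # None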
def approve(self, request=None):
self.status = 'successful'
self.approved_or_denied_by = get_current_user()
self.save()
self.send_approval_notification('approved')
self.websocket_emit_status(self.status)
schedule_task_manager()
ScheduleWorkflowManager().schedule()
return reverse('api:workflow_approval_approve', kwargs={'pk': self.pk}, request=request)
def deny(self, request=None):
@@ -825,7 +855,7 @@ class WorkflowApproval(UnifiedJob, JobNotificationMixin):
self.save()
self.send_approval_notification('denied')
self.websocket_emit_status(self.status)
schedule_task_manager()
ScheduleWorkflowManager().schedule()
return reverse('api:workflow_approval_deny', kwargs={'pk': self.pk}, request=request)
def signal_start(self, **kwargs):

View File

@@ -1,6 +1,6 @@
# Copyright (c) 2017 Ansible, Inc.
#
from .task_manager import TaskManager
from .task_manager import TaskManager, DependencyManager, WorkflowManager
__all__ = ['TaskManager']
__all__ = ['TaskManager', 'DependencyManager', 'WorkflowManager']

View File

@@ -7,6 +7,11 @@ from awx.main.models import (
WorkflowJob,
)
import logging
logger = logging.getLogger('awx.main.scheduler.dependency_graph')
class DependencyGraph(object):
PROJECT_UPDATES = 'project_updates'
@@ -36,6 +41,9 @@ class DependencyGraph(object):
self.data[self.WORKFLOW_JOB_TEMPLATES_JOBS] = {}
def mark_if_no_key(self, job_type, id, job):
if id is None:
logger.warning(f'Null dependency graph key from {job}, could be integrity error or bug, ignoring')
return
# only mark the first occurrence of a task. If 10 jobs of JobA are launched
# (concurrency disabled), the dependency graph should report that jobs
# 2 through 10 are blocked by job 1
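mark_if_no_key records only the first task seen for a given key, so every later task for the same template reports as blocked by that earliest one; null keys are now skipped with a warning. A dictionary-based sketch of the idea:

def mark_if_no_key(graph, bucket, key, job):
    """Record only the first job seen for a key; later jobs are considered blocked by it."""
    if key is None:
        return  # mirrors the new warn-and-skip path for null keys
    graph.setdefault(bucket, {}).setdefault(key, job)

graph = {}
mark_if_no_key(graph, "job_template_jobs", 42, "job-1")
mark_if_no_key(graph, "job_template_jobs", 42, "job-2")  # ignored, job-1 already marked
print(graph["job_template_jobs"][42])                     # job-1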
@@ -66,7 +74,10 @@ class DependencyGraph(object):
self.mark_if_no_key(self.JOB_TEMPLATE_JOBS, job.job_template_id, job)
def mark_workflow_job(self, job):
self.mark_if_no_key(self.WORKFLOW_JOB_TEMPLATES_JOBS, job.workflow_job_template_id, job)
if job.workflow_job_template_id:
self.mark_if_no_key(self.WORKFLOW_JOB_TEMPLATES_JOBS, job.workflow_job_template_id, job)
elif job.unified_job_template_id: # for sliced jobs
self.mark_if_no_key(self.WORKFLOW_JOB_TEMPLATES_JOBS, job.unified_job_template_id, job)
def project_update_blocked_by(self, job):
return self.get_item(self.PROJECT_UPDATES, job.project_id)
@@ -85,7 +96,13 @@ class DependencyGraph(object):
def workflow_job_blocked_by(self, job):
if job.allow_simultaneous is False:
return self.get_item(self.WORKFLOW_JOB_TEMPLATES_JOBS, job.workflow_job_template_id)
if job.workflow_job_template_id:
return self.get_item(self.WORKFLOW_JOB_TEMPLATES_JOBS, job.workflow_job_template_id)
elif job.unified_job_template_id:
# Sliced jobs can be either Job or WorkflowJob type, and either should block a sliced WorkflowJob
return self.get_item(self.WORKFLOW_JOB_TEMPLATES_JOBS, job.unified_job_template_id) or self.get_item(
self.JOB_TEMPLATE_JOBS, job.unified_job_template_id
)
return None
def system_job_blocked_by(self, job):

View File

@@ -11,31 +11,35 @@ import sys
import signal
# Django
from django.db import transaction, connection
from django.db import transaction
from django.utils.translation import gettext_lazy as _, gettext_noop
from django.utils.timezone import now as tz_now
from django.conf import settings
from django.contrib.contenttypes.models import ContentType
# AWX
from awx.main.dispatch.reaper import reap_job
from awx.main.models import (
AdHocCommand,
Instance,
InventorySource,
InventoryUpdate,
Job,
Project,
ProjectUpdate,
SystemJob,
UnifiedJob,
WorkflowApproval,
WorkflowJob,
WorkflowJobNode,
WorkflowJobTemplate,
)
from awx.main.scheduler.dag_workflow import WorkflowDAG
from awx.main.utils.pglock import advisory_lock
from awx.main.utils import get_type_for_model, task_manager_bulk_reschedule, schedule_task_manager
from awx.main.utils.common import create_partition
from awx.main.utils import (
get_type_for_model,
ScheduleTaskManager,
ScheduleWorkflowManager,
)
from awx.main.utils.common import task_manager_bulk_reschedule
from awx.main.signals import disable_activity_stream
from awx.main.constants import ACTIVE_STATES
from awx.main.scheduler.dependency_graph import DependencyGraph
@@ -53,167 +57,101 @@ def timeit(func):
t_now = time.perf_counter()
result = func(*args, **kwargs)
dur = time.perf_counter() - t_now
args[0].subsystem_metrics.inc("task_manager_" + func.__name__ + "_seconds", dur)
args[0].subsystem_metrics.inc(f"{args[0].prefix}_{func.__name__}_seconds", dur)
return result
return inner
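The timeit decorator now keys its metric off self.prefix, so each manager (task, dependency, workflow) records into its own bucket. A self-contained version of the same pattern, with a plain dict standing in for the subsystem metrics:

import time
from functools import wraps

def timeit(func):
    """Time a method and record the duration under '<prefix>_<method>_seconds'."""
    @wraps(func)
    def inner(self, *args, **kwargs):
        started = time.perf_counter()
        result = func(self, *args, **kwargs)
        self.metrics[f"{self.prefix}_{func.__name__}_seconds"] = time.perf_counter() - started
        return result
    return inner

class Manager:
    def __init__(self, prefix):
        self.prefix = prefix
        self.metrics = {}

    @timeit
    def schedule(self):
        time.sleep(0.01)

m = Manager("task_manager")
m.schedule()
print(m.metrics)  # {'task_manager_schedule_seconds': ~0.01}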
class TaskManager:
def __init__(self):
"""
Do NOT put database queries or other potentially expensive operations
in the task manager init. The task manager object is created every time a
job is created, transitions state, and every 30 seconds on each tower node.
More often than not, the object is destroyed quickly because the NOOP case is hit.
The NOOP case is short-circuit logic. If the task manager realizes that another instance
of the task manager is already running, then it short-circuits and decides not to run.
"""
# start task limit indicates how many pending jobs can be started on this
# .schedule() run. Starting jobs is expensive, and there is code in place to reap
# the task manager after 5 minutes. At scale, the task manager can easily take more than
# 5 minutes to start pending jobs. If this limit is reached, pending jobs
# will no longer be started and will be started on the next task manager cycle.
self.start_task_limit = settings.START_TASK_LIMIT
self.time_delta_job_explanation = timedelta(seconds=30)
self.subsystem_metrics = s_metrics.Metrics(auto_pipe_execute=False)
class TaskBase:
def __init__(self, prefix=""):
self.prefix = prefix
# initialize each metric to 0 and force metric_has_changed to true. This
# ensures each task manager metric will be overridden when pipe_execute
# is called later.
self.subsystem_metrics = s_metrics.Metrics(auto_pipe_execute=False)
self.start_time = time.time()
self.start_task_limit = settings.START_TASK_LIMIT
for m in self.subsystem_metrics.METRICS:
if m.startswith("task_manager"):
if m.startswith(self.prefix):
self.subsystem_metrics.set(m, 0)
def after_lock_init(self, all_sorted_tasks):
"""
Init AFTER we know this instance of the task manager will run because the lock is acquired.
"""
self.dependency_graph = DependencyGraph()
self.instances = TaskManagerInstances(all_sorted_tasks)
self.instance_groups = TaskManagerInstanceGroups(instances_by_hostname=self.instances)
self.controlplane_ig = self.instance_groups.controlplane_ig
def job_blocked_by(self, task):
# TODO: I'm not happy with this, I think blocking behavior should be decided outside of the dependency graph
# in the old task manager this was handled as a method on each task object outside of the graph and
# probably has the side effect of cutting down *a lot* of the logic from this task manager class
blocked_by = self.dependency_graph.task_blocked_by(task)
if blocked_by:
return blocked_by
for dep in task.dependent_jobs.all():
if dep.status in ACTIVE_STATES:
return dep
# if we detect a failed or error dependency, go ahead and fail this
# task. The errback on the dependency takes some time to trigger,
# and we don't want the task to enter running state if its
# dependency has failed or errored.
elif dep.status in ("error", "failed"):
task.status = 'failed'
task.job_explanation = 'Previous Task Failed: {"job_type": "%s", "job_name": "%s", "job_id": "%s"}' % (
get_type_for_model(type(dep)),
dep.name,
dep.id,
)
task.save(update_fields=['status', 'job_explanation'])
task.websocket_emit_status('failed')
return dep
return None
def timed_out(self):
"""Return True/False if we have met or exceeded the timeout for the task manager."""
elapsed = time.time() - self.start_time
if elapsed >= settings.TASK_MANAGER_TIMEOUT:
logger.warning(f"{self.prefix} manager has run for {elapsed} which is greater than TASK_MANAGER_TIMEOUT of {settings.TASK_MANAGER_TIMEOUT}.")
return True
return False
@timeit
def get_tasks(self, status_list=('pending', 'waiting', 'running')):
jobs = [j for j in Job.objects.filter(status__in=status_list).prefetch_related('instance_group')]
inventory_updates_qs = (
InventoryUpdate.objects.filter(status__in=status_list).exclude(source='file').prefetch_related('inventory_source', 'instance_group')
def get_tasks(self, filter_args):
wf_approval_ctype_id = ContentType.objects.get_for_model(WorkflowApproval).id
qs = (
UnifiedJob.objects.filter(**filter_args)
.exclude(launch_type='sync')
.exclude(polymorphic_ctype_id=wf_approval_ctype_id)
.order_by('created')
.prefetch_related('dependent_jobs')
)
inventory_updates = [i for i in inventory_updates_qs]
# Notice the job_type='check': we want to prevent implicit project updates from blocking our jobs.
project_updates = [p for p in ProjectUpdate.objects.filter(status__in=status_list, job_type='check').prefetch_related('instance_group')]
system_jobs = [s for s in SystemJob.objects.filter(status__in=status_list).prefetch_related('instance_group')]
ad_hoc_commands = [a for a in AdHocCommand.objects.filter(status__in=status_list).prefetch_related('instance_group')]
workflow_jobs = [w for w in WorkflowJob.objects.filter(status__in=status_list)]
all_tasks = sorted(jobs + project_updates + inventory_updates + system_jobs + ad_hoc_commands + workflow_jobs, key=lambda task: task.created)
return all_tasks
self.all_tasks = [t for t in qs]
def get_running_workflow_jobs(self):
graph_workflow_jobs = [wf for wf in WorkflowJob.objects.filter(status='running')]
return graph_workflow_jobs
def record_aggregate_metrics(self, *args):
if not settings.IS_TESTING():
# increment task_manager_schedule_calls regardless of whether the other
# metrics are recorded
s_metrics.Metrics(auto_pipe_execute=True).inc(f"{self.prefix}__schedule_calls", 1)
# Only record metrics if the last time recording was more
# than SUBSYSTEM_METRICS_TASK_MANAGER_RECORD_INTERVAL ago.
# Prevents a short-duration task manager that runs directly after a
# long task manager from overriding useful metrics.
current_time = time.time()
time_last_recorded = current_time - self.subsystem_metrics.decode(f"{self.prefix}_recorded_timestamp")
if time_last_recorded > settings.SUBSYSTEM_METRICS_TASK_MANAGER_RECORD_INTERVAL:
logger.debug(f"recording {self.prefix} metrics, last recorded {time_last_recorded} seconds ago")
self.subsystem_metrics.set(f"{self.prefix}_recorded_timestamp", current_time)
self.subsystem_metrics.pipe_execute()
else:
logger.debug(f"skipping recording {self.prefix} metrics, last recorded {time_last_recorded} seconds ago")
def get_inventory_source_tasks(self, all_sorted_tasks):
inventory_ids = set()
for task in all_sorted_tasks:
if isinstance(task, Job):
inventory_ids.add(task.inventory_id)
return [invsrc for invsrc in InventorySource.objects.filter(inventory_id__in=inventory_ids, update_on_launch=True)]
def record_aggregate_metrics_and_exit(self, *args):
self.record_aggregate_metrics()
sys.exit(1)
def schedule(self):
# Lock
with task_manager_bulk_reschedule():
with advisory_lock(f"{self.prefix}_lock", wait=False) as acquired:
with transaction.atomic():
if acquired is False:
logger.debug(f"Not running {self.prefix} scheduler, another task holds lock")
return
logger.debug(f"Starting {self.prefix} Scheduler")
# if sigterm due to timeout, still record metrics
signal.signal(signal.SIGTERM, self.record_aggregate_metrics_and_exit)
self._schedule()
commit_start = time.time()
if self.prefix == "task_manager":
self.subsystem_metrics.set(f"{self.prefix}_commit_seconds", time.time() - commit_start)
self.record_aggregate_metrics()
logger.debug(f"Finishing {self.prefix} Scheduler")
class WorkflowManager(TaskBase):
def __init__(self):
super().__init__(prefix="workflow_manager")
@timeit
def spawn_workflow_graph_jobs(self, workflow_jobs):
for workflow_job in workflow_jobs:
if workflow_job.cancel_flag:
logger.debug('Not spawning jobs for %s because it is pending cancelation.', workflow_job.log_format)
continue
dag = WorkflowDAG(workflow_job)
spawn_nodes = dag.bfs_nodes_to_run()
if spawn_nodes:
logger.debug('Spawning jobs for %s', workflow_job.log_format)
else:
logger.debug('No nodes to spawn for %s', workflow_job.log_format)
for spawn_node in spawn_nodes:
if spawn_node.unified_job_template is None:
continue
kv = spawn_node.get_job_kwargs()
job = spawn_node.unified_job_template.create_unified_job(**kv)
spawn_node.job = job
spawn_node.save()
logger.debug('Spawned %s in %s for node %s', job.log_format, workflow_job.log_format, spawn_node.pk)
can_start = True
if isinstance(spawn_node.unified_job_template, WorkflowJobTemplate):
workflow_ancestors = job.get_ancestor_workflows()
if spawn_node.unified_job_template in set(workflow_ancestors):
can_start = False
logger.info(
'Refusing to start recursive workflow-in-workflow id={}, wfjt={}, ancestors={}'.format(
job.id, spawn_node.unified_job_template.pk, [wa.pk for wa in workflow_ancestors]
)
)
display_list = [spawn_node.unified_job_template] + workflow_ancestors
job.job_explanation = gettext_noop(
"Workflow Job spawned from workflow could not start because it " "would result in recursion (spawn order, most recent first: {})"
).format(', '.join(['<{}>'.format(tmp) for tmp in display_list]))
else:
logger.debug(
'Starting workflow-in-workflow id={}, wfjt={}, ancestors={}'.format(
job.id, spawn_node.unified_job_template.pk, [wa.pk for wa in workflow_ancestors]
)
)
if not job._resources_sufficient_for_launch():
can_start = False
job.job_explanation = gettext_noop(
"Job spawned from workflow could not start because it " "was missing a related resource such as project or inventory"
)
if can_start:
if workflow_job.start_args:
start_args = json.loads(decrypt_field(workflow_job, 'start_args'))
else:
start_args = {}
can_start = job.signal_start(**start_args)
if not can_start:
job.job_explanation = gettext_noop(
"Job spawned from workflow could not start because it " "was not in the right state or required manual credentials"
)
if not can_start:
job.status = 'failed'
job.save(update_fields=['status', 'job_explanation'])
job.websocket_emit_status('failed')
# TODO: should we emit a status on the socket here similar to tasks.py awx_periodic_scheduler() ?
# emit_websocket_notification('/socket.io/jobs', '', dict(id=))
def process_finished_workflow_jobs(self, workflow_jobs):
def spawn_workflow_graph_jobs(self):
result = []
for workflow_job in workflow_jobs:
for workflow_job in self.all_tasks:
if self.timed_out():
logger.warning("Workflow manager has reached time out while processing running workflows, exiting loop early")
ScheduleWorkflowManager().schedule()
# Do not process any more workflow jobs. Stop here.
# Maybe we should schedule another WorkflowManager run
break
dag = WorkflowDAG(workflow_job)
status_changed = False
if workflow_job.cancel_flag:
@@ -228,99 +166,106 @@ class TaskManager:
status_changed = True
else:
workflow_nodes = dag.mark_dnr_nodes()
for n in workflow_nodes:
n.save(update_fields=['do_not_run'])
WorkflowJobNode.objects.bulk_update(workflow_nodes, ['do_not_run'])
# If workflow is now done, we do special things to mark it as done.
is_done = dag.is_workflow_done()
if not is_done:
continue
has_failed, reason = dag.has_workflow_failed()
logger.debug('Marking %s as %s.', workflow_job.log_format, 'failed' if has_failed else 'successful')
result.append(workflow_job.id)
new_status = 'failed' if has_failed else 'successful'
logger.debug("Transitioning {} to {} status.".format(workflow_job.log_format, new_status))
update_fields = ['status', 'start_args']
workflow_job.status = new_status
if reason:
logger.info(f'Workflow job {workflow_job.id} failed due to reason: {reason}')
workflow_job.job_explanation = gettext_noop("No error handling paths found, marking workflow as failed")
update_fields.append('job_explanation')
workflow_job.start_args = '' # blank field to remove encrypted passwords
workflow_job.save(update_fields=update_fields)
status_changed = True
if is_done:
has_failed, reason = dag.has_workflow_failed()
logger.debug('Marking %s as %s.', workflow_job.log_format, 'failed' if has_failed else 'successful')
result.append(workflow_job.id)
new_status = 'failed' if has_failed else 'successful'
logger.debug("Transitioning {} to {} status.".format(workflow_job.log_format, new_status))
update_fields = ['status', 'start_args']
workflow_job.status = new_status
if reason:
logger.info(f'Workflow job {workflow_job.id} failed due to reason: {reason}')
workflow_job.job_explanation = gettext_noop("No error handling paths found, marking workflow as failed")
update_fields.append('job_explanation')
workflow_job.start_args = '' # blank field to remove encrypted passwords
workflow_job.save(update_fields=update_fields)
status_changed = True
if status_changed:
if workflow_job.spawned_by_workflow:
schedule_task_manager()
ScheduleWorkflowManager().schedule()
workflow_job.websocket_emit_status(workflow_job.status)
# Operations whose queries rely on modifications made during the atomic scheduling session
workflow_job.send_notification_templates('succeeded' if workflow_job.status == 'successful' else 'failed')
if workflow_job.status == 'running':
spawn_nodes = dag.bfs_nodes_to_run()
if spawn_nodes:
logger.debug('Spawning jobs for %s', workflow_job.log_format)
else:
logger.debug('No nodes to spawn for %s', workflow_job.log_format)
for spawn_node in spawn_nodes:
if spawn_node.unified_job_template is None:
continue
kv = spawn_node.get_job_kwargs()
job = spawn_node.unified_job_template.create_unified_job(**kv)
spawn_node.job = job
spawn_node.save()
logger.debug('Spawned %s in %s for node %s', job.log_format, workflow_job.log_format, spawn_node.pk)
can_start = True
if isinstance(spawn_node.unified_job_template, WorkflowJobTemplate):
workflow_ancestors = job.get_ancestor_workflows()
if spawn_node.unified_job_template in set(workflow_ancestors):
can_start = False
logger.info(
'Refusing to start recursive workflow-in-workflow id={}, wfjt={}, ancestors={}'.format(
job.id, spawn_node.unified_job_template.pk, [wa.pk for wa in workflow_ancestors]
)
)
display_list = [spawn_node.unified_job_template] + workflow_ancestors
job.job_explanation = gettext_noop(
"Workflow Job spawned from workflow could not start because it "
"would result in recursion (spawn order, most recent first: {})"
).format(', '.join('<{}>'.format(tmp) for tmp in display_list))
else:
logger.debug(
'Starting workflow-in-workflow id={}, wfjt={}, ancestors={}'.format(
job.id, spawn_node.unified_job_template.pk, [wa.pk for wa in workflow_ancestors]
)
)
if not job._resources_sufficient_for_launch():
can_start = False
job.job_explanation = gettext_noop(
"Job spawned from workflow could not start because it was missing a related resource such as project or inventory"
)
if can_start:
if workflow_job.start_args:
start_args = json.loads(decrypt_field(workflow_job, 'start_args'))
else:
start_args = {}
can_start = job.signal_start(**start_args)
if not can_start:
job.job_explanation = gettext_noop(
"Job spawned from workflow could not start because it was not in the right state or required manual credentials"
)
if not can_start:
job.status = 'failed'
job.save(update_fields=['status', 'job_explanation'])
job.websocket_emit_status('failed')
# TODO: should we emit a status on the socket here similar to tasks.py awx_periodic_scheduler() ?
# emit_websocket_notification('/socket.io/jobs', '', dict(id=))
return result
@timeit
def start_task(self, task, instance_group, dependent_tasks=None, instance=None):
self.subsystem_metrics.inc("task_manager_tasks_started", 1)
self.start_task_limit -= 1
if self.start_task_limit == 0:
# schedule another run immediately after this task manager
schedule_task_manager()
from awx.main.tasks.system import handle_work_error, handle_work_success
dependent_tasks = dependent_tasks or []
task_actual = {
'type': get_type_for_model(type(task)),
'id': task.id,
}
dependencies = [{'type': get_type_for_model(type(t)), 'id': t.id} for t in dependent_tasks]
task.status = 'waiting'
(start_status, opts) = task.pre_start()
if not start_status:
task.status = 'failed'
if task.job_explanation:
task.job_explanation += ' '
task.job_explanation += 'Task failed pre-start check.'
task.save()
# TODO: run error handler to fail sub-tasks and send notifications
else:
if type(task) is WorkflowJob:
task.status = 'running'
task.send_notification_templates('running')
logger.debug('Transitioning %s to running status.', task.log_format)
schedule_task_manager()
# at this point we already have control/execution nodes selected for the following cases
else:
task.instance_group = instance_group
execution_node_msg = f' and execution node {task.execution_node}' if task.execution_node else ''
logger.debug(
f'Submitting job {task.log_format} controlled by {task.controller_node} to instance group {instance_group.name}{execution_node_msg}.'
)
with disable_activity_stream():
task.celery_task_id = str(uuid.uuid4())
task.save()
task.log_lifecycle("waiting")
def post_commit():
if task.status != 'failed' and type(task) is not WorkflowJob:
# Before task is dispatched, ensure that job_event partitions exist
create_partition(task.event_class._meta.db_table, start=task.created)
task_cls = task._get_task_class()
task_cls.apply_async(
[task.pk],
opts,
queue=task.get_queue_name(),
uuid=task.celery_task_id,
callbacks=[{'task': handle_work_success.name, 'kwargs': {'task_actual': task_actual}}],
errbacks=[{'task': handle_work_error.name, 'args': [task.celery_task_id], 'kwargs': {'subtasks': [task_actual] + dependencies}}],
)
task.websocket_emit_status(task.status) # adds to on_commit
connection.on_commit(post_commit)
def get_tasks(self, filter_args):
self.all_tasks = [wf for wf in WorkflowJob.objects.filter(**filter_args)]
@timeit
def process_running_tasks(self, running_tasks):
for task in running_tasks:
self.dependency_graph.add_job(task)
def _schedule(self):
self.get_tasks(dict(status__in=["running"], dependencies_processed=True))
if len(self.all_tasks) > 0:
self.spawn_workflow_graph_jobs()
class DependencyManager(TaskBase):
def __init__(self):
super().__init__(prefix="dependency_manager")
def create_project_update(self, task, project_id=None):
if project_id is None:
@@ -341,14 +286,20 @@ class TaskManager:
inventory_task.status = 'pending'
inventory_task.save()
logger.debug('Spawned {} as dependency of {}'.format(inventory_task.log_format, task.log_format))
# inventory_sources = self.get_inventory_source_tasks([task])
# self.process_inventory_sources(inventory_sources)
return inventory_task
def add_dependencies(self, task, dependencies):
with disable_activity_stream():
task.dependent_jobs.add(*dependencies)
def get_inventory_source_tasks(self):
inventory_ids = set()
for task in self.all_tasks:
if isinstance(task, Job):
inventory_ids.add(task.inventory_id)
self.all_inventory_sources = [invsrc for invsrc in InventorySource.objects.filter(inventory_id__in=inventory_ids, update_on_launch=True)]
def get_latest_inventory_update(self, inventory_source):
latest_inventory_update = InventoryUpdate.objects.filter(inventory_source=inventory_source).order_by("-created")
if not latest_inventory_update.exists():
@@ -481,16 +432,167 @@ class TaskManager:
return created_dependencies
def process_tasks(self):
deps = self.generate_dependencies(self.all_tasks)
self.generate_dependencies(deps)
self.subsystem_metrics.inc(f"{self.prefix}_pending_processed", len(self.all_tasks) + len(deps))
@timeit
def _schedule(self):
self.get_tasks(dict(status__in=["pending"], dependencies_processed=False))
if len(self.all_tasks) > 0:
self.get_inventory_source_tasks()
self.process_tasks()
ScheduleTaskManager().schedule()
class TaskManager(TaskBase):
def __init__(self):
"""
Do NOT put database queries or other potentially expensive operations
in the task manager init. The task manager object is created every time a
job is created, transitions state, and every 30 seconds on each tower node.
More often than not, the object is destroyed quickly because the NOOP case is hit.
The NOOP case is short-circuit logic. If the task manager realizes that another instance
of the task manager is already running, then it short-circuits and decides not to run.
"""
# start task limit indicates how many pending jobs can be started on this
# .schedule() run. Starting jobs is expensive, and there is code in place to reap
# the task manager after 5 minutes. At scale, the task manager can easily take more than
# 5 minutes to start pending jobs. If this limit is reached, pending jobs
# will no longer be started and will be started on the next task manager cycle.
self.time_delta_job_explanation = timedelta(seconds=30)
super().__init__(prefix="task_manager")
def after_lock_init(self):
"""
Init AFTER we know this instance of the task manager will run because the lock is acquired.
"""
self.dependency_graph = DependencyGraph()
self.instances = TaskManagerInstances(self.all_tasks)
self.instance_groups = TaskManagerInstanceGroups(instances_by_hostname=self.instances)
self.controlplane_ig = self.instance_groups.controlplane_ig
def job_blocked_by(self, task):
# TODO: I'm not happy with this, I think blocking behavior should be decided outside of the dependency graph
# in the old task manager this was handled as a method on each task object outside of the graph and
# probably has the side effect of cutting down *a lot* of the logic from this task manager class
blocked_by = self.dependency_graph.task_blocked_by(task)
if blocked_by:
return blocked_by
for dep in task.dependent_jobs.all():
if dep.status in ACTIVE_STATES:
return dep
# if we detect a failed or error dependency, go ahead and fail this
# task. The errback on the dependency takes some time to trigger,
# and we don't want the task to enter running state if its
# dependency has failed or errored.
elif dep.status in ("error", "failed"):
task.status = 'failed'
task.job_explanation = 'Previous Task Failed: {"job_type": "%s", "job_name": "%s", "job_id": "%s"}' % (
get_type_for_model(type(dep)),
dep.name,
dep.id,
)
task.save(update_fields=['status', 'job_explanation'])
task.websocket_emit_status('failed')
return dep
return None
@timeit
def start_task(self, task, instance_group, dependent_tasks=None, instance=None):
self.dependency_graph.add_job(task)
self.subsystem_metrics.inc(f"{self.prefix}_tasks_started", 1)
self.start_task_limit -= 1
if self.start_task_limit == 0:
# schedule another run immediately after this task manager
ScheduleTaskManager().schedule()
from awx.main.tasks.system import handle_work_error, handle_work_success
# update capacity for control node and execution node
if task.controller_node:
self.instances[task.controller_node].consume_capacity(settings.AWX_CONTROL_NODE_TASK_IMPACT)
if task.execution_node:
self.instances[task.execution_node].consume_capacity(task.task_impact)
dependent_tasks = dependent_tasks or []
task_actual = {
'type': get_type_for_model(type(task)),
'id': task.id,
}
dependencies = [{'type': get_type_for_model(type(t)), 'id': t.id} for t in dependent_tasks]
task.status = 'waiting'
(start_status, opts) = task.pre_start()
if not start_status:
task.status = 'failed'
if task.job_explanation:
task.job_explanation += ' '
task.job_explanation += 'Task failed pre-start check.'
task.save()
# TODO: run error handler to fail sub-tasks and send notifications
else:
if type(task) is WorkflowJob:
task.status = 'running'
task.send_notification_templates('running')
logger.debug('Transitioning %s to running status.', task.log_format)
# Call this to ensure Workflow nodes get spawned in a timely manner
ScheduleWorkflowManager().schedule()
# at this point we already have control/execution nodes selected for the following cases
else:
task.instance_group = instance_group
execution_node_msg = f' and execution node {task.execution_node}' if task.execution_node else ''
logger.debug(
f'Submitting job {task.log_format} controlled by {task.controller_node} to instance group {instance_group.name}{execution_node_msg}.'
)
with disable_activity_stream():
task.celery_task_id = str(uuid.uuid4())
task.save()
task.log_lifecycle("waiting")
# apply_async does a NOTIFY to the channel the dispatcher is listening to;
# postgres will treat this as part of the transaction, which is what we want
if task.status != 'failed' and type(task) is not WorkflowJob:
task_cls = task._get_task_class()
task_cls.apply_async(
[task.pk],
opts,
queue=task.get_queue_name(),
uuid=task.celery_task_id,
callbacks=[{'task': handle_work_success.name, 'kwargs': {'task_actual': task_actual}}],
errbacks=[{'task': handle_work_error.name, 'args': [task.celery_task_id], 'kwargs': {'subtasks': [task_actual] + dependencies}}],
)
# In exception cases, like a job failing pre-start checks, we send the websocket status message
# for jobs going into waiting, we omit this because of performance issues, as it should go to running quickly
if task.status != 'waiting':
task.websocket_emit_status(task.status) # adds to on_commit
@timeit
def process_running_tasks(self, running_tasks):
for task in running_tasks:
if type(task) is WorkflowJob:
ScheduleWorkflowManager().schedule()
self.dependency_graph.add_job(task)
@timeit
def process_pending_tasks(self, pending_tasks):
running_workflow_templates = {wf.unified_job_template_id for wf in self.get_running_workflow_jobs()}
tasks_to_update_job_explanation = []
for task in pending_tasks:
if self.start_task_limit <= 0:
break
if self.timed_out():
logger.warning("Task manager has reached time out while processing pending jobs, exiting loop early")
break
blocked_by = self.job_blocked_by(task)
if blocked_by:
self.subsystem_metrics.inc("task_manager_tasks_blocked", 1)
self.subsystem_metrics.inc(f"{self.prefix}_tasks_blocked", 1)
task.log_lifecycle("blocked", blocked_by=blocked_by)
job_explanation = gettext_noop(f"waiting for {blocked_by._meta.model_name}-{blocked_by.id} to finish")
if task.job_explanation != job_explanation:
@@ -499,19 +601,16 @@ class TaskManager:
tasks_to_update_job_explanation.append(task)
continue
found_acceptable_queue = False
preferred_instance_groups = task.preferred_instance_groups
if isinstance(task, WorkflowJob):
if task.unified_job_template_id in running_workflow_templates:
if not task.allow_simultaneous:
logger.debug("{} is blocked from running, workflow already running".format(task.log_format))
continue
else:
running_workflow_templates.add(task.unified_job_template_id)
# Previously we were tracking allow_simultaneous blocking both here and in DependencyGraph.
# Double check that using just the DependencyGraph works for Workflows and Sliced Jobs.
self.start_task(task, None, task.get_jobs_fail_chain(), None)
continue
found_acceptable_queue = False
preferred_instance_groups = self.instance_groups.get_instance_groups_from_task_cache(task)
# Determine if there is control capacity for the task
if task.capacity_type == 'control':
control_impact = task.task_impact + settings.AWX_CONTROL_NODE_TASK_IMPACT
@@ -530,8 +629,6 @@ class TaskManager:
# All task.capacity_type == 'control' jobs should run on control plane, no need to loop over instance groups
if task.capacity_type == 'control':
task.execution_node = control_instance.hostname
control_instance.consume_capacity(control_impact)
self.dependency_graph.add_job(task)
execution_instance = self.instances[control_instance.hostname].obj
task.log_lifecycle("controller_node_chosen")
task.log_lifecycle("execution_node_chosen")
@@ -541,7 +638,6 @@ class TaskManager:
for instance_group in preferred_instance_groups:
if instance_group.is_container_group:
self.dependency_graph.add_job(task)
self.start_task(task, instance_group, task.get_jobs_fail_chain(), None)
found_acceptable_queue = True
break
@@ -563,9 +659,7 @@ class TaskManager:
control_instance = execution_instance
task.controller_node = execution_instance.hostname
control_instance.consume_capacity(settings.AWX_CONTROL_NODE_TASK_IMPACT)
task.log_lifecycle("controller_node_chosen")
execution_instance.consume_capacity(task.task_impact)
task.log_lifecycle("execution_node_chosen")
logger.debug(
"Starting {} in group {} instance {} (remaining_capacity={})".format(
@@ -573,7 +667,6 @@ class TaskManager:
)
)
execution_instance = self.instances[execution_instance.hostname].obj
self.dependency_graph.add_job(task)
self.start_task(task, instance_group, task.get_jobs_fail_chain(), execution_instance)
found_acceptable_queue = True
break
@@ -599,25 +692,6 @@ class TaskManager:
tasks_to_update_job_explanation.append(task)
logger.debug("{} couldn't be scheduled on graph, waiting for next cycle".format(task.log_format))
def timeout_approval_node(self):
workflow_approvals = WorkflowApproval.objects.filter(status='pending')
now = tz_now()
for task in workflow_approvals:
approval_timeout_seconds = timedelta(seconds=task.timeout)
if task.timeout == 0:
continue
if (now - task.created) >= approval_timeout_seconds:
timeout_message = _("The approval node {name} ({pk}) has expired after {timeout} seconds.").format(
name=task.name, pk=task.pk, timeout=task.timeout
)
logger.warning(timeout_message)
task.timed_out = True
task.status = 'failed'
task.send_approval_notification('timed_out')
task.websocket_emit_status(task.status)
task.job_explanation = timeout_message
task.save(update_fields=['status', 'job_explanation', 'timed_out'])
def reap_jobs_from_orphaned_instances(self):
# discover jobs that are in running state but aren't on an execution node
# that we know about; this is a fairly rare event, but it can occur if you,
@@ -630,92 +704,45 @@ class TaskManager:
logger.error(f'{j.execution_node} is not a registered instance; reaping {j.log_format}')
reap_job(j, 'failed')
def process_tasks(self, all_sorted_tasks):
running_tasks = [t for t in all_sorted_tasks if t.status in ['waiting', 'running']]
def process_tasks(self):
running_tasks = [t for t in self.all_tasks if t.status in ['waiting', 'running']]
self.process_running_tasks(running_tasks)
self.subsystem_metrics.inc("task_manager_running_processed", len(running_tasks))
self.subsystem_metrics.inc(f"{self.prefix}_running_processed", len(running_tasks))
pending_tasks = [t for t in all_sorted_tasks if t.status == 'pending']
undeped_tasks = [t for t in pending_tasks if not t.dependencies_processed]
dependencies = self.generate_dependencies(undeped_tasks)
deps_of_deps = self.generate_dependencies(dependencies)
dependencies += deps_of_deps
self.process_pending_tasks(dependencies)
self.subsystem_metrics.inc("task_manager_pending_processed", len(dependencies))
pending_tasks = [t for t in self.all_tasks if t.status == 'pending']
self.process_pending_tasks(pending_tasks)
self.subsystem_metrics.inc("task_manager_pending_processed", len(pending_tasks))
self.subsystem_metrics.inc(f"{self.prefix}_pending_processed", len(pending_tasks))
def timeout_approval_node(self, task):
if self.timed_out():
logger.warning("Task manager has reached time out while processing approval nodes, exiting loop early")
# Do not process any more workflow approval nodes. Stop here.
# Maybe we should schedule another TaskManager run
return
timeout_message = _("The approval node {name} ({pk}) has expired after {timeout} seconds.").format(name=task.name, pk=task.pk, timeout=task.timeout)
logger.warning(timeout_message)
task.timed_out = True
task.status = 'failed'
task.send_approval_notification('timed_out')
task.websocket_emit_status(task.status)
task.job_explanation = timeout_message
task.save(update_fields=['status', 'job_explanation', 'timed_out'])
def get_expired_workflow_approvals(self):
# timeout of 0 indicates that it never expires
qs = WorkflowApproval.objects.filter(status='pending').exclude(timeout=0).filter(expires__lt=tz_now())
return qs
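The approval-timeout check now relies on a precomputed expires column instead of recomputing created + timeout for every pending approval; a timeout of 0 is excluded because it means the approval never expires. A minimal sketch of the presumed expires computation (consistent with the WorkflowApproval expires tests later in this diff; the actual save() override is not shown here):

import datetime

def compute_expires(created, timeout_seconds):
    # None means the approval never expires (timeout of 0)
    if not timeout_seconds:
        return None
    return created + datetime.timedelta(seconds=timeout_seconds)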
@timeit
def _schedule(self):
finished_wfjs = []
all_sorted_tasks = self.get_tasks()
self.get_tasks(dict(status__in=["pending", "waiting", "running"], dependencies_processed=True))
self.after_lock_init(all_sorted_tasks)
self.after_lock_init()
self.reap_jobs_from_orphaned_instances()
if len(all_sorted_tasks) > 0:
# TODO: Deal with
# latest_project_updates = self.get_latest_project_update_tasks(all_sorted_tasks)
# self.process_latest_project_updates(latest_project_updates)
if len(self.all_tasks) > 0:
self.process_tasks()
# latest_inventory_updates = self.get_latest_inventory_update_tasks(all_sorted_tasks)
# self.process_latest_inventory_updates(latest_inventory_updates)
self.all_inventory_sources = self.get_inventory_source_tasks(all_sorted_tasks)
running_workflow_tasks = self.get_running_workflow_jobs()
finished_wfjs = self.process_finished_workflow_jobs(running_workflow_tasks)
previously_running_workflow_tasks = running_workflow_tasks
running_workflow_tasks = []
for workflow_job in previously_running_workflow_tasks:
if workflow_job.status == 'running':
running_workflow_tasks.append(workflow_job)
else:
logger.debug('Removed %s from job spawning consideration.', workflow_job.log_format)
self.spawn_workflow_graph_jobs(running_workflow_tasks)
self.timeout_approval_node()
self.reap_jobs_from_orphaned_instances()
self.process_tasks(all_sorted_tasks)
return finished_wfjs
def record_aggregate_metrics(self, *args):
if not settings.IS_TESTING():
# increment task_manager_schedule_calls regardless of whether the other
# metrics are recorded
s_metrics.Metrics(auto_pipe_execute=True).inc("task_manager_schedule_calls", 1)
# Only record metrics if the last recording was more than
# SUBSYSTEM_METRICS_TASK_MANAGER_RECORD_INTERVAL seconds ago.
# Prevents a short-duration task manager that runs directly after a
# long task manager from overriding useful metrics.
current_time = time.time()
time_last_recorded = current_time - self.subsystem_metrics.decode("task_manager_recorded_timestamp")
if time_last_recorded > settings.SUBSYSTEM_METRICS_TASK_MANAGER_RECORD_INTERVAL:
logger.debug(f"recording metrics, last recorded {time_last_recorded} seconds ago")
self.subsystem_metrics.set("task_manager_recorded_timestamp", current_time)
self.subsystem_metrics.pipe_execute()
else:
logger.debug(f"skipping recording metrics, last recorded {time_last_recorded} seconds ago")
def record_aggregate_metrics_and_exit(self, *args):
self.record_aggregate_metrics()
sys.exit(1)
def schedule(self):
# Lock
with advisory_lock('task_manager_lock', wait=False) as acquired:
with transaction.atomic():
if acquired is False:
logger.debug("Not running scheduler, another task holds lock")
return
logger.debug("Starting Scheduler")
with task_manager_bulk_reschedule():
# if sigterm due to timeout, still record metrics
signal.signal(signal.SIGTERM, self.record_aggregate_metrics_and_exit)
self._schedule()
self.record_aggregate_metrics()
logger.debug("Finishing Scheduler")
for workflow_approval in self.get_expired_workflow_approvals():
self.timeout_approval_node(workflow_approval)

View File

@@ -34,14 +34,10 @@ class TaskManagerInstance:
class TaskManagerInstances:
def __init__(self, active_tasks, instances=None):
def __init__(self, active_tasks, instances=None, instance_fields=('node_type', 'capacity', 'hostname', 'enabled')):
self.instances_by_hostname = dict()
if instances is None:
instances = (
Instance.objects.filter(hostname__isnull=False, node_state=Instance.States.READY, enabled=True)
.exclude(node_type='hop')
.only('node_type', 'node_state', 'capacity', 'hostname', 'enabled')
)
instances = Instance.objects.filter(hostname__isnull=False, enabled=True).exclude(node_type='hop').only(*instance_fields)
for instance in instances:
self.instances_by_hostname[instance.hostname] = TaskManagerInstance(instance)
@@ -69,6 +65,7 @@ class TaskManagerInstanceGroups:
def __init__(self, instances_by_hostname=None, instance_groups=None, instance_groups_queryset=None):
self.instance_groups = dict()
self.controlplane_ig = None
self.pk_ig_map = dict()
if instance_groups is not None: # for testing
self.instance_groups = instance_groups
@@ -83,6 +80,7 @@ class TaskManagerInstanceGroups:
instances_by_hostname[instance.hostname] for instance in instance_group.instances.all() if instance.hostname in instances_by_hostname
],
)
self.pk_ig_map[instance_group.pk] = instance_group
def get_remaining_capacity(self, group_name):
instances = self.instance_groups[group_name]['instances']
@@ -123,3 +121,17 @@ class TaskManagerInstanceGroups:
elif i.capacity > largest_instance.capacity:
largest_instance = i
return largest_instance
def get_instance_groups_from_task_cache(self, task):
igs = []
if task.preferred_instance_groups_cache:
for pk in task.preferred_instance_groups_cache:
ig = self.pk_ig_map.get(pk, None)
if ig:
igs.append(ig)
else:
logger.warn(f"Unknown instance group with pk {pk} for task {task}")
if len(igs) == 0:
logger.warn(f"No instance groups in cache exist, defaulting to global instance groups for task {task}")
return task.global_instance_groups
return igs
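get_instance_groups_from_task_cache turns a list of instance group primary keys stored on the task back into in-memory instance group objects via pk_ig_map, and falls back to the task's global instance groups when nothing in the cache resolves. A hedged sketch of how such a cache might be written earlier in the task lifecycle (the writer is not part of this hunk, so the field and property names below are assumptions):

def cache_preferred_instance_groups(task):
    # Store only primary keys so the task manager can resolve them against its
    # own in-memory pk_ig_map without extra queries per task.
    task.preferred_instance_groups_cache = [ig.pk for ig in task.preferred_instance_groups]
    task.save(update_fields=['preferred_instance_groups_cache'])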

View File

@@ -1,15 +1,35 @@
# Python
import logging
# Django
from django.conf import settings
# AWX
from awx.main.scheduler import TaskManager
from awx import MODE
from awx.main.scheduler import TaskManager, DependencyManager, WorkflowManager
from awx.main.dispatch.publish import task
from awx.main.dispatch import get_local_queuename
logger = logging.getLogger('awx.main.scheduler')
def run_manager(manager, prefix):
if MODE == 'development' and settings.AWX_DISABLE_TASK_MANAGERS:
logger.debug(f"Not running {prefix} manager, AWX_DISABLE_TASK_MANAGERS is True. Trigger with GET to /api/debug/{prefix}_manager/")
return
manager().schedule()
@task(queue=get_local_queuename)
def run_task_manager():
logger.debug("Running task manager.")
TaskManager().schedule()
def task_manager():
run_manager(TaskManager, "task")
@task(queue=get_local_queuename)
def dependency_manager():
run_manager(DependencyManager, "dependency")
@task(queue=get_local_queuename)
def workflow_manager():
run_manager(WorkflowManager, "workflow")
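Each manager is now a thin dispatcher task wrapping run_manager, which no-ops in development when AWX_DISABLE_TASK_MANAGERS is set and points at the debug endpoint instead. A sketch of kicking off all three managers in their implied order, assuming the dispatcher's task decorator exposes apply_async() as it does elsewhere in AWX:

from awx.main.scheduler.tasks import dependency_manager, task_manager, workflow_manager

def kick_all_managers():
    # dependencies first, then the task manager, then workflow progression
    for manager_task in (dependency_manager, task_manager, workflow_manager):
        manager_task.apply_async()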

View File

@@ -6,17 +6,16 @@ import os
import stat
# Django
from django.utils.timezone import now
from django.conf import settings
from django_guid import get_guid
from django.utils.functional import cached_property
from django.db import connections
# AWX
from awx.main.redact import UriCleaner
from awx.main.constants import MINIMAL_EVENTS, ANSIBLE_RUNNER_NEEDS_UPDATE_MESSAGE
from awx.main.utils.update_model import update_model
from awx.main.queue import CallbackQueueDispatcher
from awx.main.tasks.signals import signal_callback
logger = logging.getLogger('awx.main.tasks.callback')
@@ -175,28 +174,6 @@ class RunnerCallback:
return False
def cancel_callback(self):
"""
Ansible runner callback to tell the job when/if it is canceled
"""
unified_job_id = self.instance.pk
if signal_callback():
return True
try:
self.instance = self.update_model(unified_job_id)
except Exception:
logger.exception(f'Encountered error during cancel check for {unified_job_id}, canceling now')
return True
if not self.instance:
logger.error('unified job {} was deleted while running, canceling'.format(unified_job_id))
return True
if self.instance.cancel_flag or self.instance.status == 'canceled':
cancel_wait = (now() - self.instance.modified).seconds if self.instance.modified else 0
if cancel_wait > 5:
logger.warning('Request to cancel {} took {} seconds to complete.'.format(self.instance.log_format, cancel_wait))
return True
return False
def finished_callback(self, runner_obj):
"""
Ansible runner callback triggered on finished run
@@ -227,6 +204,8 @@ class RunnerCallback:
with disable_activity_stream():
self.instance = self.update_model(self.instance.pk, job_args=json.dumps(runner_config.command), job_cwd=runner_config.cwd, job_env=job_env)
# We opened a connection just for that save, close it here now
connections.close_all()
elif status_data['status'] == 'failed':
# For encrypted ssh_key_data, ansible-runner worker will open and write the
# ssh_key_data to a named pipe. Then, once the podman container starts, ssh-agent will

View File

@@ -1,6 +1,5 @@
# Python
from collections import OrderedDict
from distutils.dir_util import copy_tree
import errno
import functools
import fcntl
@@ -15,7 +14,6 @@ import tempfile
import traceback
import time
import urllib.parse as urlparse
from uuid import uuid4
# Django
from django.conf import settings
@@ -38,6 +36,7 @@ from awx.main.constants import (
JOB_FOLDER_PREFIX,
MAX_ISOLATED_PATH_COLON_DELIMITER,
CONTAINER_VOLUMES_MOUNT_TYPES,
ACTIVE_STATES,
)
from awx.main.models import (
Instance,
@@ -211,14 +210,22 @@ class BaseTask(object):
os.chmod(path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
if settings.AWX_CLEANUP_PATHS:
self.cleanup_paths.append(path)
# Ansible runner requires that project exists,
# and we will write files in the other folders without pre-creating the folder
for subfolder in ('project', 'inventory', 'env'):
# We will write files in these folders later
for subfolder in ('inventory', 'env'):
runner_subfolder = os.path.join(path, subfolder)
if not os.path.exists(runner_subfolder):
os.mkdir(runner_subfolder)
return path
def build_project_dir(self, instance, private_data_dir):
"""
Create the ansible-runner project subdirectory. In many cases this is the source checkout.
In cases that do not even need the source checkout, we create an empty dir to be the workdir.
"""
project_dir = os.path.join(private_data_dir, 'project')
if not os.path.exists(project_dir):
os.mkdir(project_dir)
def build_private_data_files(self, instance, private_data_dir):
"""
Creates temporary files containing the private data.
@@ -354,12 +361,65 @@ class BaseTask(object):
expect_passwords[k] = passwords.get(v, '') or ''
return expect_passwords
def release_lock(self, project):
try:
fcntl.lockf(self.lock_fd, fcntl.LOCK_UN)
except IOError as e:
logger.error("I/O error({0}) while trying to release lock file [{1}]: {2}".format(e.errno, project.get_lock_file(), e.strerror))
os.close(self.lock_fd)
raise
os.close(self.lock_fd)
self.lock_fd = None
def acquire_lock(self, project, unified_job_id=None):
if not os.path.exists(settings.PROJECTS_ROOT):
os.mkdir(settings.PROJECTS_ROOT)
lock_path = project.get_lock_file()
if lock_path is None:
# If from migration or someone blanked local_path for any other reason, recoverable by save
project.save()
lock_path = project.get_lock_file()
if lock_path is None:
raise RuntimeError(u'Invalid lock file path')
try:
self.lock_fd = os.open(lock_path, os.O_RDWR | os.O_CREAT)
except OSError as e:
logger.error("I/O error({0}) while trying to open lock file [{1}]: {2}".format(e.errno, lock_path, e.strerror))
raise
start_time = time.time()
while True:
try:
fcntl.lockf(self.lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
break
except IOError as e:
if e.errno not in (errno.EAGAIN, errno.EACCES):
os.close(self.lock_fd)
logger.error("I/O error({0}) while trying to aquire lock on file [{1}]: {2}".format(e.errno, lock_path, e.strerror))
raise
else:
time.sleep(1.0)
self.instance.refresh_from_db(fields=['cancel_flag'])
if self.instance.cancel_flag or signal_callback():
logger.debug(f"Unified job {self.instance.id} was canceled while waiting for project file lock")
return
waiting_time = time.time() - start_time
if waiting_time > 1.0:
logger.info(f'Job {unified_job_id} waited {waiting_time} seconds to acquire lock for local source tree for path {lock_path}.')
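The loop above takes the project lock with a non-blocking fcntl.lockf, sleeping one second between attempts and bailing out if the job is canceled while waiting. A self-contained sketch of that locking pattern with a caller-supplied cancellation check (illustrative only):

import errno
import fcntl
import os
import time

def acquire_file_lock(lock_path, should_cancel, poll_interval=1.0):
    # Returns a locked fd, or None if should_cancel() becomes true while waiting.
    fd = os.open(lock_path, os.O_RDWR | os.O_CREAT)
    while True:
        try:
            fcntl.lockf(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
            return fd
        except IOError as e:
            if e.errno not in (errno.EAGAIN, errno.EACCES):
                os.close(fd)
                raise
            if should_cancel():
                os.close(fd)
                return None
            time.sleep(poll_interval)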
def pre_run_hook(self, instance, private_data_dir):
"""
Hook for any steps to run before the job/task starts
"""
instance.log_lifecycle("pre_run")
# Before task is started, ensure that job_event partitions exist
create_partition(instance.event_class._meta.db_table, start=instance.created)
def post_run_hook(self, instance, status):
"""
Hook for any steps to run before job/task is marked as complete.
@@ -372,15 +432,9 @@ class BaseTask(object):
"""
instance.log_lifecycle("finalize_run")
artifact_dir = os.path.join(private_data_dir, 'artifacts', str(self.instance.id))
job_profiling_dir = os.path.join(artifact_dir, 'playbook_profiling')
awx_profiling_dir = '/var/log/tower/playbook_profiling/'
collections_info = os.path.join(artifact_dir, 'collections.json')
ansible_version_file = os.path.join(artifact_dir, 'ansible_version.txt')
if not os.path.exists(awx_profiling_dir):
os.mkdir(awx_profiling_dir)
if os.path.isdir(job_profiling_dir):
shutil.copytree(job_profiling_dir, os.path.join(awx_profiling_dir, str(instance.pk)))
if os.path.exists(collections_info):
with open(collections_info) as ee_json_info:
ee_collections_info = json.loads(ee_json_info.read())
@@ -399,6 +453,11 @@ class BaseTask(object):
Run the job/task and capture its output.
"""
self.instance = self.model.objects.get(pk=pk)
if self.instance.status != 'canceled' and self.instance.cancel_flag:
self.instance = self.update_model(self.instance.pk, start_args='', status='canceled')
if self.instance.status not in ACTIVE_STATES:
# Prevent starting the job if it has been reaped or handled by another process.
raise RuntimeError(f'Not starting task pk={pk} because its status "{self.instance.status}" is not a valid active state')
if self.instance.execution_environment_id is None:
from awx.main.signals import disable_activity_stream
@@ -424,9 +483,11 @@ class BaseTask(object):
self.instance.send_notification_templates("running")
private_data_dir = self.build_private_data_dir(self.instance)
self.pre_run_hook(self.instance, private_data_dir)
self.build_project_dir(self.instance, private_data_dir)
self.instance.log_lifecycle("preparing_playbook")
if self.instance.cancel_flag or signal_callback():
self.instance = self.update_model(self.instance.pk, status='canceled')
if self.instance.status != 'running':
# Stop the task chain and prevent starting the job if it has
# already been canceled.
@@ -529,7 +590,7 @@ class BaseTask(object):
event_handler=self.runner_callback.event_handler,
finished_callback=self.runner_callback.finished_callback,
status_handler=self.runner_callback.status_handler,
cancel_callback=self.runner_callback.cancel_callback,
cancel_callback=signal_callback,
**params,
)
else:
@@ -549,8 +610,12 @@ class BaseTask(object):
status = 'failed'
elif status == 'canceled':
self.instance = self.update_model(pk)
if (getattr(self.instance, 'cancel_flag', False) is False) and signal_callback():
self.runner_callback.delay_update(job_explanation="Task was canceled due to receiving a shutdown signal.")
cancel_flag_value = getattr(self.instance, 'cancel_flag', False)
if (cancel_flag_value is False) and signal_callback():
self.runner_callback.delay_update(skip_if_already_set=True, job_explanation="Task was canceled due to receiving a shutdown signal.")
status = 'failed'
elif cancel_flag_value is False:
self.runner_callback.delay_update(skip_if_already_set=True, job_explanation="The running ansible process received a shutdown signal.")
status = 'failed'
except ReceptorNodeNotFound as exc:
self.runner_callback.delay_update(job_explanation=str(exc))
@@ -593,8 +658,143 @@ class BaseTask(object):
raise AwxTaskError.TaskError(self.instance, rc)
class SourceControlMixin(BaseTask):
"""Utility methods for tasks that run use content from source control"""
def get_sync_needs(self, project, scm_branch=None):
project_path = project.get_project_path(check_if_exists=False)
job_revision = project.scm_revision
sync_needs = []
source_update_tag = 'update_{}'.format(project.scm_type)
branch_override = bool(scm_branch and scm_branch != project.scm_branch)
# TODO: skip syncs for inventory updates. Now, UI needs a link added so clients can link to project
# source_project is only a field on inventory sources.
if isinstance(self.instance, InventoryUpdate):
sync_needs.append(source_update_tag)
elif not project.scm_type:
pass # manual projects are not synced, user has responsibility for that
elif not os.path.exists(project_path):
logger.debug(f'Performing fresh clone of {project.id} for unified job {self.instance.id} on this instance.')
sync_needs.append(source_update_tag)
elif project.scm_type == 'git' and project.scm_revision and (not branch_override):
try:
git_repo = git.Repo(project_path)
if job_revision == git_repo.head.commit.hexsha:
logger.debug(f'Skipping project sync for {self.instance.id} because commit is locally available')
else:
sync_needs.append(source_update_tag)
except (ValueError, BadGitName, git.exc.InvalidGitRepositoryError):
logger.debug(f'Needed commit for {self.instance.id} not in local source tree, will sync with remote')
sync_needs.append(source_update_tag)
else:
logger.debug(f'Project not available locally, {self.instance.id} will sync with remote')
sync_needs.append(source_update_tag)
has_cache = os.path.exists(os.path.join(project.get_cache_path(), project.cache_id))
# Galaxy requirements are not supported for manual projects
if project.scm_type and ((not has_cache) or branch_override):
sync_needs.extend(['install_roles', 'install_collections'])
return sync_needs
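In short: inventory updates always request a source update, manual projects request nothing, git projects skip the update when the wanted commit is already checked out locally, and the roles/collections install tags are added whenever the galaxy cache is missing or a branch override is in effect. A simplified, dependency-free restatement of that decision (an illustrative sketch, not the method itself):

def sync_tags(scm_type, is_inventory_update, repo_exists, revision_matches, has_cache, branch_override):
    tags = []
    if is_inventory_update:
        tags.append(f'update_{scm_type}')
    elif scm_type and (not repo_exists or not revision_matches or branch_override):
        tags.append(f'update_{scm_type}')
    if scm_type and (not has_cache or branch_override):
        tags.extend(['install_roles', 'install_collections'])
    return tags

# e.g. a fresh clone of a git project with no galaxy cache:
# sync_tags('git', False, False, False, False, False)
# -> ['update_git', 'install_roles', 'install_collections']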
def spawn_project_sync(self, project, sync_needs, scm_branch=None):
pu_ig = self.instance.instance_group
pu_en = Instance.objects.me().hostname
sync_metafields = dict(
launch_type="sync",
job_type='run',
job_tags=','.join(sync_needs),
status='running',
instance_group=pu_ig,
execution_node=pu_en,
controller_node=pu_en,
celery_task_id=self.instance.celery_task_id,
)
if scm_branch and scm_branch != project.scm_branch:
sync_metafields['scm_branch'] = scm_branch
sync_metafields['scm_clean'] = True # to accommodate force pushes
if 'update_' not in sync_metafields['job_tags']:
sync_metafields['scm_revision'] = project.scm_revision
local_project_sync = project.create_project_update(_eager_fields=sync_metafields)
local_project_sync.log_lifecycle("controller_node_chosen")
local_project_sync.log_lifecycle("execution_node_chosen")
return local_project_sync
def sync_and_copy_without_lock(self, project, private_data_dir, scm_branch=None):
sync_needs = self.get_sync_needs(project, scm_branch=scm_branch)
if sync_needs:
local_project_sync = self.spawn_project_sync(project, sync_needs, scm_branch=scm_branch)
# save the associated job before calling run() so that a
# cancel() call on the job can cancel the project update
if isinstance(self.instance, Job):
self.instance = self.update_model(self.instance.pk, project_update=local_project_sync)
else:
self.instance = self.update_model(self.instance.pk, source_project_update=local_project_sync)
try:
# the job private_data_dir is passed so sync can download roles and collections there
sync_task = RunProjectUpdate(job_private_data_dir=private_data_dir)
sync_task.run(local_project_sync.id)
local_project_sync.refresh_from_db()
if isinstance(self.instance, Job):
self.instance = self.update_model(self.instance.pk, scm_revision=local_project_sync.scm_revision)
except Exception:
local_project_sync.refresh_from_db()
if local_project_sync.status != 'canceled':
self.instance = self.update_model(
self.instance.pk,
status='failed',
job_explanation=(
'Previous Task Failed: {"job_type": "project_update", '
f'"job_name": "{local_project_sync.name}", "job_id": "{local_project_sync.id}"}}'
),
)
raise
self.instance.refresh_from_db()
if self.instance.cancel_flag:
return
else:
# Case where a local sync is not needed, meaning that local tree is
# up-to-date with project, job is running project current version
if isinstance(self.instance, Job):
self.instance = self.update_model(self.instance.pk, scm_revision=project.scm_revision)
# Project update does not copy the folder, so copy here
RunProjectUpdate.make_local_copy(project, private_data_dir)
def sync_and_copy(self, project, private_data_dir, scm_branch=None):
self.acquire_lock(project, self.instance.id)
try:
original_branch = None
project_path = project.get_project_path(check_if_exists=False)
if project.scm_type == 'git' and (scm_branch and scm_branch != project.scm_branch):
if os.path.exists(project_path):
git_repo = git.Repo(project_path)
if git_repo.head.is_detached:
original_branch = git_repo.head.commit
else:
original_branch = git_repo.active_branch
return self.sync_and_copy_without_lock(project, private_data_dir, scm_branch=scm_branch)
finally:
# We have made the copy so we can set the tree back to its normal state
if original_branch:
# for git project syncs, non-default branches can be problems
# restore to branch the repo was on before this run
try:
original_branch.checkout()
except Exception:
# this could have failed due to dirty tree, but difficult to predict all cases
logger.exception(f'Failed to restore project repo to prior state after {self.instance.id}')
self.release_lock(project)
@task(queue=get_local_queuename)
class RunJob(BaseTask):
class RunJob(SourceControlMixin, BaseTask):
"""
Run a job using ansible-playbook.
"""
@@ -863,98 +1063,14 @@ class RunJob(BaseTask):
job = self.update_model(job.pk, status='failed', job_explanation=msg)
raise RuntimeError(msg)
project_path = job.project.get_project_path(check_if_exists=False)
job_revision = job.project.scm_revision
sync_needs = []
source_update_tag = 'update_{}'.format(job.project.scm_type)
branch_override = bool(job.scm_branch and job.scm_branch != job.project.scm_branch)
if not job.project.scm_type:
pass # manual projects are not synced, user has responsibility for that
elif not os.path.exists(project_path):
logger.debug('Performing fresh clone of {} on this instance.'.format(job.project))
sync_needs.append(source_update_tag)
elif job.project.scm_type == 'git' and job.project.scm_revision and (not branch_override):
try:
git_repo = git.Repo(project_path)
if job_revision == git_repo.head.commit.hexsha:
logger.debug('Skipping project sync for {} because commit is locally available'.format(job.log_format))
else:
sync_needs.append(source_update_tag)
except (ValueError, BadGitName, git.exc.InvalidGitRepositoryError):
logger.debug('Needed commit for {} not in local source tree, will sync with remote'.format(job.log_format))
sync_needs.append(source_update_tag)
else:
logger.debug('Project not available locally, {} will sync with remote'.format(job.log_format))
sync_needs.append(source_update_tag)
has_cache = os.path.exists(os.path.join(job.project.get_cache_path(), job.project.cache_id))
# Galaxy requirements are not supported for manual projects
if job.project.scm_type and ((not has_cache) or branch_override):
sync_needs.extend(['install_roles', 'install_collections'])
if sync_needs:
pu_ig = job.instance_group
pu_en = Instance.objects.me().hostname
sync_metafields = dict(
launch_type="sync",
job_type='run',
job_tags=','.join(sync_needs),
status='running',
instance_group=pu_ig,
execution_node=pu_en,
controller_node=pu_en,
celery_task_id=job.celery_task_id,
)
if branch_override:
sync_metafields['scm_branch'] = job.scm_branch
sync_metafields['scm_clean'] = True # to accomidate force pushes
if 'update_' not in sync_metafields['job_tags']:
sync_metafields['scm_revision'] = job_revision
local_project_sync = job.project.create_project_update(_eager_fields=sync_metafields)
local_project_sync.log_lifecycle("controller_node_chosen")
local_project_sync.log_lifecycle("execution_node_chosen")
create_partition(local_project_sync.event_class._meta.db_table, start=local_project_sync.created)
# save the associated job before calling run() so that a
# cancel() call on the job can cancel the project update
job = self.update_model(job.pk, project_update=local_project_sync)
project_update_task = local_project_sync._get_task_class()
try:
# the job private_data_dir is passed so sync can download roles and collections there
sync_task = project_update_task(job_private_data_dir=private_data_dir)
sync_task.run(local_project_sync.id)
local_project_sync.refresh_from_db()
job = self.update_model(job.pk, scm_revision=local_project_sync.scm_revision)
except Exception:
local_project_sync.refresh_from_db()
if local_project_sync.status != 'canceled':
job = self.update_model(
job.pk,
status='failed',
job_explanation=(
'Previous Task Failed: {"job_type": "%s", "job_name": "%s", "job_id": "%s"}'
% ('project_update', local_project_sync.name, local_project_sync.id)
),
)
raise
job.refresh_from_db()
if job.cancel_flag:
return
else:
# Case where a local sync is not needed, meaning that local tree is
# up-to-date with project, job is running project current version
if job_revision:
job = self.update_model(job.pk, scm_revision=job_revision)
# Project update does not copy the folder, so copy here
RunProjectUpdate.make_local_copy(job.project, private_data_dir, scm_revision=job_revision)
if job.inventory.kind == 'smart':
# cache smart inventory memberships so that the host_filter query is not
# ran inside of the event saving code
update_smart_memberships_for_inventory(job.inventory)
def build_project_dir(self, job, private_data_dir):
self.sync_and_copy(job.project, private_data_dir, scm_branch=job.scm_branch)
def final_run_hook(self, job, status, private_data_dir, fact_modification_times):
super(RunJob, self).final_run_hook(job, status, private_data_dir, fact_modification_times)
if not private_data_dir:
@@ -986,7 +1102,6 @@ class RunProjectUpdate(BaseTask):
def __init__(self, *args, job_private_data_dir=None, **kwargs):
super(RunProjectUpdate, self).__init__(*args, **kwargs)
self.original_branch = None
self.job_private_data_dir = job_private_data_dir
def build_private_data(self, project_update, private_data_dir):
@@ -1156,6 +1271,10 @@ class RunProjectUpdate(BaseTask):
# for raw archive, prevent error moving files between volumes
extra_vars['ansible_remote_tmp'] = os.path.join(project_update.get_project_path(check_if_exists=False), '.ansible_awx', 'tmp')
if project_update.project.signature_validation_credential is not None:
pubkey = project_update.project.signature_validation_credential.get_input('gpg_public_key')
extra_vars['gpg_pubkey'] = pubkey
self._write_extra_vars_file(private_data_dir, extra_vars)
def build_playbook_path_relative_to_cwd(self, project_update, private_data_dir):
@@ -1173,74 +1292,13 @@ class RunProjectUpdate(BaseTask):
d[r'^Are you sure you want to continue connecting \(yes/no\)\?\s*?$'] = 'yes'
return d
def release_lock(self, instance):
try:
fcntl.lockf(self.lock_fd, fcntl.LOCK_UN)
except IOError as e:
logger.error("I/O error({0}) while trying to release lock file [{1}]: {2}".format(e.errno, instance.get_lock_file(), e.strerror))
os.close(self.lock_fd)
raise
os.close(self.lock_fd)
self.lock_fd = None
'''
Note: We don't support blocking=False
'''
def acquire_lock(self, instance, blocking=True):
lock_path = instance.get_lock_file()
if lock_path is None:
# If from migration or someone blanked local_path for any other reason, recoverable by save
instance.save()
lock_path = instance.get_lock_file()
if lock_path is None:
raise RuntimeError(u'Invalid lock file path')
try:
self.lock_fd = os.open(lock_path, os.O_RDWR | os.O_CREAT)
except OSError as e:
logger.error("I/O error({0}) while trying to open lock file [{1}]: {2}".format(e.errno, lock_path, e.strerror))
raise
start_time = time.time()
while True:
try:
instance.refresh_from_db(fields=['cancel_flag'])
if instance.cancel_flag:
logger.debug("ProjectUpdate({0}) was canceled".format(instance.pk))
return
fcntl.lockf(self.lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
break
except IOError as e:
if e.errno not in (errno.EAGAIN, errno.EACCES):
os.close(self.lock_fd)
logger.error("I/O error({0}) while trying to aquire lock on file [{1}]: {2}".format(e.errno, lock_path, e.strerror))
raise
else:
time.sleep(1.0)
waiting_time = time.time() - start_time
if waiting_time > 1.0:
logger.info('{} spent {} waiting to acquire lock for local source tree ' 'for path {}.'.format(instance.log_format, waiting_time, lock_path))
def pre_run_hook(self, instance, private_data_dir):
super(RunProjectUpdate, self).pre_run_hook(instance, private_data_dir)
# re-create root project folder if a natural disaster has destroyed it
if not os.path.exists(settings.PROJECTS_ROOT):
os.mkdir(settings.PROJECTS_ROOT)
project_path = instance.project.get_project_path(check_if_exists=False)
self.acquire_lock(instance)
self.original_branch = None
if instance.scm_type == 'git' and instance.branch_override:
if os.path.exists(project_path):
git_repo = git.Repo(project_path)
if git_repo.head.is_detached:
self.original_branch = git_repo.head.commit
else:
self.original_branch = git_repo.active_branch
if instance.launch_type != 'sync':
self.acquire_lock(instance.project, instance.id)
if not os.path.exists(project_path):
os.makedirs(project_path) # used as container mount
@@ -1251,11 +1309,12 @@ class RunProjectUpdate(BaseTask):
shutil.rmtree(stage_path)
os.makedirs(stage_path) # presence of empty cache indicates lack of roles or collections
def build_project_dir(self, instance, private_data_dir):
# the project update playbook is not in a git repo; it lives in a vendored playbooks directory
# that is copied into the runner project folder here to stay consistent with the
# ansible-runner directory model
awx_playbooks = self.get_path_to('../../', 'playbooks')
copy_tree(awx_playbooks, os.path.join(private_data_dir, 'project'))
shutil.copytree(awx_playbooks, os.path.join(private_data_dir, 'project'))
@staticmethod
def clear_project_cache(cache_dir, keep_value):
@@ -1272,50 +1331,18 @@ class RunProjectUpdate(BaseTask):
logger.warning(f"Could not remove cache directory {old_path}")
@staticmethod
def make_local_copy(p, job_private_data_dir, scm_revision=None):
def make_local_copy(project, job_private_data_dir):
"""Copy project content (roles and collections) to a job private_data_dir
:param object p: Either a project or a project update
:param object project: Either a project or a project update
:param str job_private_data_dir: The root of the target ansible-runner folder
:param str scm_revision: For branch_override cases, the git revision to copy
"""
project_path = p.get_project_path(check_if_exists=False)
project_path = project.get_project_path(check_if_exists=False)
destination_folder = os.path.join(job_private_data_dir, 'project')
if not scm_revision:
scm_revision = p.scm_revision
if p.scm_type == 'git':
git_repo = git.Repo(project_path)
if not os.path.exists(destination_folder):
os.mkdir(destination_folder, stat.S_IREAD | stat.S_IWRITE | stat.S_IEXEC)
tmp_branch_name = 'awx_internal/{}'.format(uuid4())
# always clone based on specific job revision
if not p.scm_revision:
raise RuntimeError('Unexpectedly could not determine a revision to run from project.')
source_branch = git_repo.create_head(tmp_branch_name, p.scm_revision)
# git clone must take file:// syntax for source repo or else options like depth will be ignored
source_as_uri = Path(project_path).as_uri()
git.Repo.clone_from(
source_as_uri,
destination_folder,
branch=source_branch,
depth=1,
single_branch=True, # shallow, do not copy full history
)
# submodules copied in loop because shallow copies from local HEADs are ideal
# and no git clone submodule options are compatible with minimum requirements
for submodule in git_repo.submodules:
subrepo_path = os.path.abspath(os.path.join(project_path, submodule.path))
subrepo_destination_folder = os.path.abspath(os.path.join(destination_folder, submodule.path))
subrepo_uri = Path(subrepo_path).as_uri()
git.Repo.clone_from(subrepo_uri, subrepo_destination_folder, depth=1, single_branch=True)
# force option is necessary because remote refs are not counted, although no information is lost
git_repo.delete_head(tmp_branch_name, force=True)
else:
copy_tree(project_path, destination_folder, preserve_symlinks=1)
shutil.copytree(project_path, destination_folder, ignore=shutil.ignore_patterns('.git'), symlinks=True)
# copy over the roles and collection cache to job folder
cache_path = os.path.join(p.get_cache_path(), p.cache_id)
cache_path = os.path.join(project.get_cache_path(), project.cache_id)
subfolders = []
if settings.AWX_COLLECTIONS_ENABLED:
subfolders.append('requirements_collections')
@@ -1325,8 +1352,8 @@ class RunProjectUpdate(BaseTask):
cache_subpath = os.path.join(cache_path, subfolder)
if os.path.exists(cache_subpath):
dest_subpath = os.path.join(job_private_data_dir, subfolder)
copy_tree(cache_subpath, dest_subpath, preserve_symlinks=1)
logger.debug('{0} {1} prepared {2} from cache'.format(type(p).__name__, p.pk, dest_subpath))
shutil.copytree(cache_subpath, dest_subpath, symlinks=True)
logger.debug('{0} {1} prepared {2} from cache'.format(type(project).__name__, project.pk, dest_subpath))
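This hunk replaces distutils.dir_util.copy_tree (deprecated along with distutils) with shutil.copytree, preserving symlinks and skipping the .git directory when copying the checked-out tree into the job's private data dir. A minimal generic example of the same call pattern:

import shutil

def copy_project_tree(src, dest):
    # dest must not already exist; symlinks are kept as links and .git is left behind
    shutil.copytree(src, dest, symlinks=True, ignore=shutil.ignore_patterns('.git'))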
def post_run_hook(self, instance, status):
super(RunProjectUpdate, self).post_run_hook(instance, status)
@@ -1356,23 +1383,13 @@ class RunProjectUpdate(BaseTask):
if self.job_private_data_dir:
if status == 'successful':
# copy project folder before resetting to default branch
# because some git-tree-specific resources (like submodules) might matter
self.make_local_copy(instance, self.job_private_data_dir)
if self.original_branch:
# for git project syncs, non-default branches can be problems
# restore to branch the repo was on before this run
try:
self.original_branch.checkout()
except Exception:
# this could have failed due to dirty tree, but difficult to predict all cases
logger.exception('Failed to restore project repo to prior state after {}'.format(instance.log_format))
finally:
self.release_lock(instance)
if instance.launch_type != 'sync':
self.release_lock(instance.project)
p = instance.project
if instance.job_type == 'check' and status not in (
'failed',
'canceled',
):
if instance.job_type == 'check' and status not in ('failed', 'canceled'):
if self.runner_callback.playbook_new_revision:
p.scm_revision = self.runner_callback.playbook_new_revision
else:
@@ -1400,7 +1417,7 @@ class RunProjectUpdate(BaseTask):
@task(queue=get_local_queuename)
class RunInventoryUpdate(BaseTask):
class RunInventoryUpdate(SourceControlMixin, BaseTask):
model = InventoryUpdate
event_model = InventoryUpdateEvent
@@ -1556,54 +1573,18 @@ class RunInventoryUpdate(BaseTask):
# All credentials not used by inventory source injector
return inventory_update.get_extra_credentials()
def pre_run_hook(self, inventory_update, private_data_dir):
super(RunInventoryUpdate, self).pre_run_hook(inventory_update, private_data_dir)
def build_project_dir(self, inventory_update, private_data_dir):
source_project = None
if inventory_update.inventory_source:
source_project = inventory_update.inventory_source.source_project
if inventory_update.source == 'scm' and source_project and source_project.scm_type: # never ever update manual projects
# Check if the content cache exists, so that we do not unnecessarily re-download roles
sync_needs = ['update_{}'.format(source_project.scm_type)]
has_cache = os.path.exists(os.path.join(source_project.get_cache_path(), source_project.cache_id))
# Galaxy requirements are not supported for manual projects
if not has_cache:
sync_needs.extend(['install_roles', 'install_collections'])
local_project_sync = source_project.create_project_update(
_eager_fields=dict(
launch_type="sync",
job_type='run',
job_tags=','.join(sync_needs),
status='running',
execution_node=Instance.objects.me().hostname,
controller_node=Instance.objects.me().hostname,
instance_group=inventory_update.instance_group,
celery_task_id=inventory_update.celery_task_id,
)
)
local_project_sync.log_lifecycle("controller_node_chosen")
local_project_sync.log_lifecycle("execution_node_chosen")
create_partition(local_project_sync.event_class._meta.db_table, start=local_project_sync.created)
# associate the inventory update before calling run() so that a
# cancel() call on the inventory update can cancel the project update
local_project_sync.scm_inventory_updates.add(inventory_update)
project_update_task = local_project_sync._get_task_class()
try:
sync_task = project_update_task(job_private_data_dir=private_data_dir)
sync_task.run(local_project_sync.id)
local_project_sync.refresh_from_db()
except Exception:
inventory_update = self.update_model(
inventory_update.pk,
status='failed',
job_explanation=(
'Previous Task Failed: {"job_type": "%s", "job_name": "%s", "job_id": "%s"}'
% ('project_update', local_project_sync.name, local_project_sync.id)
),
)
raise
if inventory_update.source == 'scm':
if not source_project:
raise RuntimeError('Could not find project to run SCM inventory update from.')
self.sync_and_copy(source_project, private_data_dir)
else:
# If the source is not SCM, make an empty project directory; content is built inside the inventory folder
super(RunInventoryUpdate, self).build_project_dir(inventory_update, private_data_dir)
def post_run_hook(self, inventory_update, status):
super(RunInventoryUpdate, self).post_run_hook(inventory_update, status)
@@ -1646,7 +1627,7 @@ class RunInventoryUpdate(BaseTask):
handler = SpecialInventoryHandler(
self.runner_callback.event_handler,
self.runner_callback.cancel_callback,
signal_callback,
verbosity=inventory_update.verbosity,
job_timeout=self.get_instance_timeout(self.instance),
start_time=inventory_update.started,

View File

@@ -12,6 +12,7 @@ import yaml
# Django
from django.conf import settings
from django.db import connections
# Runner
import ansible_runner
@@ -25,6 +26,7 @@ from awx.main.utils.common import (
cleanup_new_process,
)
from awx.main.constants import MAX_ISOLATED_PATH_COLON_DELIMITER
from awx.main.tasks.signals import signal_state, signal_callback, SignalExit
# Receptorctl
from receptorctl.socket_interface import ReceptorControl
@@ -99,16 +101,22 @@ def administrative_workunit_reaper(work_list=None):
for unit_id, work_data in work_list.items():
extra_data = work_data.get('ExtraData')
if (extra_data is None) or (extra_data.get('RemoteWorkType') != 'ansible-runner'):
if extra_data is None:
continue # if this is not ansible-runner work, we do not want to touch it
params = extra_data.get('RemoteParams', {}).get('params')
if not params:
continue
if not (params == '--worker-info' or params.startswith('cleanup')):
continue # if this is not a cleanup or health check, we do not want to touch it
if work_data.get('StateName') in RECEPTOR_ACTIVE_STATES:
continue # do not want to touch active work units
logger.info(f'Reaping orphaned work unit {unit_id} with params {params}')
if isinstance(extra_data, str):
if not work_data.get('StateName', None) or work_data.get('StateName') in RECEPTOR_ACTIVE_STATES:
continue
else:
if extra_data.get('RemoteWorkType') != 'ansible-runner':
continue
params = extra_data.get('RemoteParams', {}).get('params')
if not params:
continue
if not (params == '--worker-info' or params.startswith('cleanup')):
continue # if this is not a cleanup or health check, we do not want to touch it
if work_data.get('StateName') in RECEPTOR_ACTIVE_STATES:
continue # do not want to touch active work units
logger.info(f'Reaping orphaned work unit {unit_id} with params {params}')
receptor_ctl.simple_command(f"work release {unit_id}")
@@ -329,24 +337,32 @@ class AWXReceptorJob:
shutil.rmtree(artifact_dir)
resultsock, resultfile = receptor_ctl.get_work_results(self.unit_id, return_socket=True, return_sockfile=True)
# Both "processor" and "cancel_watcher" are spawned in separate threads.
# We wait for the first one to return. If cancel_watcher returns first,
# we yank the socket out from underneath the processor, which will cause it
# to exit. A reference to the processor_future is passed into the cancel_watcher_future,
# Which exits if the job has finished normally. The context manager ensures we do not
# leave any threads laying around.
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
processor_future = executor.submit(self.processor, resultfile)
cancel_watcher_future = executor.submit(self.cancel_watcher, processor_future)
futures = [processor_future, cancel_watcher_future]
first_future = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_COMPLETED)
res = list(first_future.done)[0].result()
if res.status == 'canceled':
connections.close_all()
# "processor" and the main thread will be separate threads.
# If a cancel happens, the main thread will encounter an exception, in which case
# we yank the socket out from underneath the processor, which will cause it to exit.
# The ThreadPoolExecutor context manager ensures we do not leave any threads laying around.
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
processor_future = executor.submit(self.processor, resultfile)
try:
signal_state.raise_exception = True
# address race condition where SIGTERM was issued after this dispatcher task started
if signal_callback():
raise SignalExit()
res = processor_future.result()
except SignalExit:
receptor_ctl.simple_command(f"work cancel {self.unit_id}")
resultsock.shutdown(socket.SHUT_RDWR)
resultfile.close()
elif res.status == 'error':
result = namedtuple('result', ['status', 'rc'])
res = result('canceled', 1)
finally:
signal_state.raise_exception = False
if res.status == 'error':
# If ansible-runner ran, but an error occurred at runtime, the traceback information
# is saved via the status_handler passed in to the processor.
if 'result_traceback' in self.task.runner_callback.extra_update_fields:
@@ -440,18 +456,6 @@ class AWXReceptorJob:
return 'local'
return 'ansible-runner'
@cleanup_new_process
def cancel_watcher(self, processor_future):
while True:
if processor_future.done():
return processor_future.result()
if self.task.runner_callback.cancel_callback():
result = namedtuple('result', ['status', 'rc'])
return result('canceled', 1)
time.sleep(1)
@property
def pod_definition(self):
ee = self.task.instance.execution_environment

View File

@@ -9,12 +9,17 @@ logger = logging.getLogger('awx.main.tasks.signals')
__all__ = ['with_signal_handling', 'signal_callback']
class SignalExit(Exception):
pass
class SignalState:
def reset(self):
self.sigterm_flag = False
self.is_active = False
self.original_sigterm = None
self.original_sigint = None
self.raise_exception = False
def __init__(self):
self.reset()
@@ -22,6 +27,9 @@ class SignalState:
def set_flag(self, *args):
"""Method to pass into the python signal.signal method to receive signals"""
self.sigterm_flag = True
if self.raise_exception:
self.raise_exception = False # so it is not raised a second time in error handling
raise SignalExit()
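The raise_exception flag lets callers (the receptor processing code in the previous file) opt into having the SIGTERM handler raise SignalExit, so a thread blocked on a future result unwinds immediately instead of polling a cancel flag. A generic sketch of the mechanism, independent of AWX internals (names here are illustrative):

import signal

class Cancelled(Exception):
    pass

state = {'sigterm': False, 'raise_exception': False}

def handler(signum, frame):
    state['sigterm'] = True
    if state['raise_exception']:
        state['raise_exception'] = False  # avoid raising a second time during cleanup
        raise Cancelled()

signal.signal(signal.SIGTERM, handler)

def wait_for(future):
    state['raise_exception'] = True
    try:
        return future.result()  # unwound by Cancelled if SIGTERM arrives
    except Cancelled:
        future.cancel()
        raise
    finally:
        state['raise_exception'] = False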
def connect_signals(self):
self.original_sigterm = signal.getsignal(signal.SIGTERM)

View File

@@ -10,12 +10,13 @@ from contextlib import redirect_stdout
import shutil
import time
from distutils.version import LooseVersion as Version
from datetime import datetime
# Django
from django.conf import settings
from django.db import transaction, DatabaseError, IntegrityError
from django.db.models.fields.related import ForeignKey
from django.utils.timezone import now
from django.utils.timezone import now, timedelta
from django.utils.encoding import smart_str
from django.contrib.auth.models import User
from django.utils.translation import gettext_lazy as _
@@ -53,7 +54,8 @@ from awx.main.dispatch import get_local_queuename, reaper
from awx.main.utils.common import (
ignore_inventory_computed_fields,
ignore_inventory_group_removal,
schedule_task_manager,
ScheduleWorkflowManager,
ScheduleTaskManager,
)
from awx.main.utils.external_logging import reconfigure_rsyslog
@@ -103,6 +105,8 @@ def dispatch_startup():
#
apply_cluster_membership_policies()
cluster_node_heartbeat()
reaper.startup_reaping()
reaper.reap_waiting(grace_period=0)
m = Metrics()
m.reset_values()
@@ -114,7 +118,11 @@ def inform_cluster_of_shutdown():
try:
this_inst = Instance.objects.get(hostname=settings.CLUSTER_HOST_ID)
this_inst.mark_offline(update_last_seen=True, errors=_('Instance received normal shutdown signal'))
logger.warning('Normal shutdown signal for instance {}, removed self from capacity pool.'.format(this_inst.hostname))
try:
reaper.reap_waiting(this_inst, grace_period=0)
except Exception:
logger.exception('failed to reap waiting jobs for {}'.format(this_inst.hostname))
logger.warning('Normal shutdown signal for instance {}, ' 'removed self from capacity pool.'.format(this_inst.hostname))
except Exception:
logger.exception('Encountered problem with normal shutdown signal.')
@@ -341,13 +349,9 @@ def _cleanup_images_and_files(**kwargs):
logger.info(f'Performed local cleanup with kwargs {kwargs}, output:\n{stdout}')
# if we are the first instance alphabetically, then run cleanup on execution nodes
checker_instance = (
Instance.objects.filter(node_type__in=['hybrid', 'control'], node_state=Instance.States.READY, enabled=True, capacity__gt=0)
.order_by('-hostname')
.first()
)
checker_instance = Instance.objects.filter(node_type__in=['hybrid', 'control'], enabled=True, capacity__gt=0).order_by('-hostname').first()
if checker_instance and this_inst.hostname == checker_instance.hostname:
for inst in Instance.objects.filter(node_type='execution', node_state=Instance.States.READY, enabled=True, capacity__gt=0):
for inst in Instance.objects.filter(node_type='execution', enabled=True, capacity__gt=0):
runner_cleanup_kwargs = inst.get_cleanup_task_kwargs(**kwargs)
if not runner_cleanup_kwargs:
continue
@@ -403,9 +407,6 @@ def execution_node_health_check(node):
if instance.node_type != 'execution':
raise RuntimeError(f'Execution node health check ran against {instance.node_type} node {instance.hostname}')
if instance.node_state not in (Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
raise RuntimeError(f"Execution node health check ran against node {instance.hostname} in state {instance.node_state}")
data = worker_info(node)
prior_capacity = instance.capacity
@@ -439,7 +440,6 @@ def inspect_execution_nodes(instance_list):
nowtime = now()
workers = mesh_status['Advertisements']
for ad in workers:
hostname = ad['NodeID']
@@ -453,7 +453,9 @@ def inspect_execution_nodes(instance_list):
if instance.node_type in ('control', 'hybrid'):
continue
was_lost = instance.is_lost(ref_time=nowtime)
last_seen = parse_date(ad['Time'])
if instance.last_seen and instance.last_seen >= last_seen:
continue
instance.last_seen = last_seen
@@ -461,12 +463,12 @@ def inspect_execution_nodes(instance_list):
# Only execution nodes should be dealt with by execution_node_health_check
if instance.node_type == 'hop':
if instance.node_state in (Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
if was_lost and (not instance.is_lost(ref_time=nowtime)):
logger.warning(f'Hop node {hostname}, has rejoined the receptor mesh')
instance.save_health_data(errors='')
continue
if instance.node_state in (Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
if was_lost:
# if the instance *was* lost, but has appeared again,
# attempt to re-establish the initial capacity and version
# check
@@ -481,11 +483,11 @@ def inspect_execution_nodes(instance_list):
execution_node_health_check.apply_async([hostname])
@task(queue=get_local_queuename)
def cluster_node_heartbeat():
@task(queue=get_local_queuename, bind_kwargs=['dispatch_time', 'worker_tasks'])
def cluster_node_heartbeat(dispatch_time=None, worker_tasks=None):
logger.debug("Cluster node heartbeat task.")
nowtime = now()
instance_list = list(Instance.objects.filter(node_state__in=(Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED)))
instance_list = list(Instance.objects.all())
this_inst = None
lost_instances = []
@@ -505,12 +507,23 @@ def cluster_node_heartbeat():
if this_inst:
startup_event = this_inst.is_lost(ref_time=nowtime)
last_last_seen = this_inst.last_seen
this_inst.local_health_check()
if startup_event and this_inst.capacity != 0:
logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname))
logger.warning(f'Rejoining the cluster as instance {this_inst.hostname}. Prior last_seen {last_last_seen}')
return
elif not last_last_seen:
logger.warning(f'Instance does not have recorded last_seen, updating to {nowtime}')
elif (nowtime - last_last_seen) > timedelta(seconds=settings.CLUSTER_NODE_HEARTBEAT_PERIOD + 2):
logger.warning(f'Heartbeat skew - interval={(nowtime - last_last_seen).total_seconds():.4f}, expected={settings.CLUSTER_NODE_HEARTBEAT_PERIOD}')
else:
raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
if settings.AWX_AUTO_DEPROVISION_INSTANCES:
(changed, this_inst) = Instance.objects.register(ip_address=os.environ.get('MY_POD_IP'), node_type='control', uuid=settings.SYSTEM_UUID)
if changed:
logger.warning(f'Recreated instance record {this_inst.hostname} after unexpected removal')
this_inst.local_health_check()
else:
raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
# IFF any node has a greater version than we do, then we'll shutdown services
for other_inst in instance_list:
if other_inst.node_type in ('execution', 'hop'):
@@ -530,15 +543,17 @@ def cluster_node_heartbeat():
for other_inst in lost_instances:
try:
reaper.reap(other_inst)
explanation = "Job reaped due to instance shutdown"
reaper.reap(other_inst, job_explanation=explanation)
reaper.reap_waiting(other_inst, grace_period=0, job_explanation=explanation)
except Exception:
logger.exception('failed to reap jobs for {}'.format(other_inst.hostname))
try:
if settings.AWX_AUTO_DEPROVISION_INSTANCES:
deprovision_hostname = other_inst.hostname
other_inst.delete() # FIXME: what about associated inbound links?
other_inst.delete()
logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname))
elif other_inst.node_state == Instance.States.READY:
elif other_inst.capacity != 0 or (not other_inst.errors):
other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen))
@@ -548,6 +563,15 @@ def cluster_node_heartbeat():
else:
logger.exception('Error marking {} as lost'.format(other_inst.hostname))
# Run local reaper
if worker_tasks is not None:
active_task_ids = []
for task_list in worker_tasks.values():
active_task_ids.extend(task_list)
reaper.reap(instance=this_inst, excluded_uuids=active_task_ids)
if max(len(task_list) for task_list in worker_tasks.values()) <= 1:
reaper.reap_waiting(instance=this_inst, excluded_uuids=active_task_ids, ref_time=datetime.fromisoformat(dispatch_time))
@task(queue=get_local_queuename)
def awx_receptor_workunit_reaper():
@@ -595,7 +619,8 @@ def awx_k8s_reaper():
for group in InstanceGroup.objects.filter(is_container_group=True).iterator():
logger.debug("Checking for orphaned k8s pods for {}.".format(group))
pods = PodManager.list_active_jobs(group)
for job in UnifiedJob.objects.filter(pk__in=pods.keys()).exclude(status__in=ACTIVE_STATES):
time_cutoff = now() - timedelta(seconds=settings.K8S_POD_REAPER_GRACE_PERIOD)
for job in UnifiedJob.objects.filter(pk__in=pods.keys(), finished__lte=time_cutoff).exclude(status__in=ACTIVE_STATES):
logger.debug('{} is no longer active, reaping orphaned k8s pod'.format(job.log_format))
try:
pm = PodManager(job)
@@ -663,6 +688,13 @@ def awx_periodic_scheduler():
state.save()
def schedule_manager_success_or_error(instance):
if instance.unifiedjob_blocked_jobs.exists():
ScheduleTaskManager().schedule()
if instance.spawned_by_workflow:
ScheduleWorkflowManager().schedule()
@task(queue=get_local_queuename)
def handle_work_success(task_actual):
try:
@@ -672,8 +704,7 @@ def handle_work_success(task_actual):
return
if not instance:
return
schedule_task_manager()
schedule_manager_success_or_error(instance)
@task(queue=get_local_queuename)
@@ -715,8 +746,7 @@ def handle_work_error(task_id, *args, **kwargs):
# what the job complete message handler does then we may want to send a
# completion event for each job here.
if first_instance:
schedule_task_manager()
pass
schedule_manager_success_or_error(first_instance)
@task(queue=get_local_queuename)

View File

@@ -13,7 +13,10 @@ from awx.main.models.workflow import (
WorkflowJobTemplateNode,
)
from awx.main.models.credential import Credential
from awx.main.scheduler import TaskManager
from awx.main.scheduler import TaskManager, WorkflowManager, DependencyManager
# Django
from django.utils.timezone import now, timedelta
@pytest.fixture
@@ -137,8 +140,9 @@ class TestApprovalNodes:
post(url, {'name': 'Approve Test', 'description': '', 'timeout': 0}, user=admin_user, expect=201)
post(reverse('api:workflow_job_template_launch', kwargs={'pk': wfjt.pk}), user=admin_user, expect=201)
wf_job = WorkflowJob.objects.first()
DependencyManager().schedule() # TODO: exclude workflows from this and delete line
TaskManager().schedule()
TaskManager().schedule()
WorkflowManager().schedule()
wfj_node = wf_job.workflow_nodes.first()
approval = wfj_node.job
assert approval.name == 'Approve Test'
@@ -162,8 +166,9 @@ class TestApprovalNodes:
post(url, {'name': 'Deny Test', 'description': '', 'timeout': 0}, user=admin_user, expect=201)
post(reverse('api:workflow_job_template_launch', kwargs={'pk': wfjt.pk}), user=admin_user, expect=201)
wf_job = WorkflowJob.objects.first()
DependencyManager().schedule() # TODO: exclude workflows from this and delete line
TaskManager().schedule()
TaskManager().schedule()
WorkflowManager().schedule()
wfj_node = wf_job.workflow_nodes.first()
approval = wfj_node.job
assert approval.name == 'Deny Test'
@@ -216,6 +221,37 @@ class TestApprovalNodes:
approval.refresh_from_db()
assert approval.status == 'failed'
def test_expires_time_on_creation(self):
now_time = now()
wa = WorkflowApproval.objects.create(timeout=34)
# this is fudged, so we assert that the expires time is in reasonable range
assert timedelta(seconds=33) < (wa.expires - now_time) < timedelta(seconds=35)
@pytest.mark.parametrize('with_update_fields', [True, False])
def test_expires_time_update(self, with_update_fields):
wa = WorkflowApproval.objects.create()
assert wa.timeout == 0
assert wa.expires is None
wa.timeout = 1234
if with_update_fields:
wa.save(update_fields=['timeout'])
else:
wa.save()
assert wa.created + timedelta(seconds=1234) == wa.expires
@pytest.mark.parametrize('with_update_fields', [True, False])
def test_reset_timeout_and_expires(self, with_update_fields):
wa = WorkflowApproval.objects.create()
wa.timeout = 1234
wa.save()
assert wa.expires
wa.timeout = 0
if with_update_fields:
wa.save(update_fields=['timeout'])
else:
wa.save()
assert wa.expires is None
@pytest.mark.django_db
class TestExclusiveRelationshipEnforcement:

View File

@@ -0,0 +1,40 @@
from unittest import mock
import pytest
from crum import impersonate
from awx.main.models import Host
@pytest.mark.django_db
def test_modified_by_not_changed(inventory):
with impersonate(None):
host = Host.objects.create(name='foo', inventory=inventory)
assert host.modified_by is None
host.variables = {'foo': 'bar'}
with mock.patch('django.db.models.Model.save') as save_mock:
host.save(update_fields=['variables'])
save_mock.assert_called_once_with(update_fields=['variables'])
@pytest.mark.django_db
def test_modified_by_changed(inventory, alice):
with impersonate(None):
host = Host.objects.create(name='foo', inventory=inventory)
assert host.modified_by is None
with impersonate(alice):
host.variables = {'foo': 'bar'}
with mock.patch('django.db.models.Model.save') as save_mock:
host.save(update_fields=['variables'])
save_mock.assert_called_once_with(update_fields=['variables', 'modified_by'])
assert host.modified_by == alice
@pytest.mark.django_db
def test_created_by(inventory, alice):
with impersonate(alice):
host = Host.objects.create(name='foo', inventory=inventory)
assert host.created_by == alice
with impersonate(None):
host = Host.objects.create(name='bar', inventory=inventory)
assert host.created_by is None

View File

@@ -252,12 +252,14 @@ class TestTaskImpact:
def test_limit_task_impact(self, job_host_limit, run_computed_fields_right_away):
job = job_host_limit(5, 2)
job.inventory.update_computed_fields()
job.task_impact = job._get_task_impact()
assert job.inventory.total_hosts == 5
assert job.task_impact == 2 + 1 # forks becomes constraint
def test_host_task_impact(self, job_host_limit, run_computed_fields_right_away):
job = job_host_limit(3, 5)
job.inventory.update_computed_fields()
job.task_impact = job._get_task_impact()
assert job.task_impact == 3 + 1 # hosts becomes constraint
def test_shard_task_impact(self, slice_job_factory, run_computed_fields_right_away):
@@ -270,9 +272,13 @@ class TestTaskImpact:
# Even distribution - all jobs run on 1 host
assert [len(jobs[0].inventory.get_script_data(slice_number=i + 1, slice_count=3)['all']['hosts']) for i in range(3)] == [1, 1, 1]
jobs[0].inventory.update_computed_fields()
for j in jobs:
j.task_impact = j._get_task_impact()
assert [job.task_impact for job in jobs] == [2, 2, 2] # plus one base task impact
# Uneven distribution - first job takes the extra host
jobs[0].inventory.hosts.create(name='remainder_foo')
assert [len(jobs[0].inventory.get_script_data(slice_number=i + 1, slice_count=3)['all']['hosts']) for i in range(3)] == [2, 1, 1]
jobs[0].inventory.update_computed_fields()
# recalculate task_impact
jobs[0].task_impact = jobs[0]._get_task_impact()
assert [job.task_impact for job in jobs] == [3, 2, 2]
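The assertions above imply a simple rule for task_impact; the helper below is an illustrative reconstruction (not quoted from the implementation): a job's impact is the number of hosts it will actually touch, capped by its forks, plus one base unit.

def task_impact(forks, host_count):
    # whichever of forks or host count is smaller becomes the constraint,
    # plus one base task impact for the job itself
    return min(forks, host_count) + 1

assert task_impact(2, 5) == 3   # forks becomes constraint
assert task_impact(5, 3) == 4   # hosts becomes constraint
assert task_impact(5, 1) == 2   # sliced job touching a single host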


@@ -0,0 +1,6 @@
def create_job(jt, dependencies_processed=True):
job = jt.create_unified_job()
job.status = "pending"
job.dependencies_processed = dependencies_processed
job.save()
return job


@@ -1,9 +1,10 @@
import pytest
from unittest import mock
from datetime import timedelta
from awx.main.scheduler import TaskManager
from awx.main.models import InstanceGroup, WorkflowJob
from awx.main.scheduler import TaskManager, DependencyManager
from awx.main.models import InstanceGroup
from awx.main.tasks.system import apply_cluster_membership_policies
from . import create_job
@pytest.mark.django_db
@@ -12,16 +13,12 @@ def test_multi_group_basic_job_launch(instance_factory, controlplane_instance_gr
i2 = instance_factory("i2")
ig1 = instance_group_factory("ig1", instances=[i1])
ig2 = instance_group_factory("ig2", instances=[i2])
objects1 = job_template_factory('jt1', organization='org1', project='proj1', inventory='inv1', credential='cred1', jobs=["job_should_start"])
objects1 = job_template_factory('jt1', organization='org1', project='proj1', inventory='inv1', credential='cred1')
objects1.job_template.instance_groups.add(ig1)
j1 = objects1.jobs['job_should_start']
j1.status = 'pending'
j1.save()
objects2 = job_template_factory('jt2', organization='org2', project='proj2', inventory='inv2', credential='cred2', jobs=["job_should_still_start"])
j1 = create_job(objects1.job_template)
objects2 = job_template_factory('jt2', organization='org2', project='proj2', inventory='inv2', credential='cred2')
objects2.job_template.instance_groups.add(ig2)
j2 = objects2.jobs['job_should_still_start']
j2.status = 'pending'
j2.save()
j2 = create_job(objects2.job_template)
with mock.patch('awx.main.models.Job.task_impact', new_callable=mock.PropertyMock) as mock_task_impact:
mock_task_impact.return_value = 500
with mocker.patch("awx.main.scheduler.TaskManager.start_task"):
@@ -35,23 +32,26 @@ def test_multi_group_with_shared_dependency(instance_factory, controlplane_insta
i2 = instance_factory("i2")
ig1 = instance_group_factory("ig1", instances=[i1])
ig2 = instance_group_factory("ig2", instances=[i2])
objects1 = job_template_factory('jt1', organization='org1', project='proj1', inventory='inv1', credential='cred1', jobs=["job_should_start"])
objects1 = job_template_factory(
'jt1',
organization='org1',
project='proj1',
inventory='inv1',
credential='cred1',
)
objects1.job_template.instance_groups.add(ig1)
j1 = create_job(objects1.job_template, dependencies_processed=False)
p = objects1.project
p.scm_update_on_launch = True
p.scm_update_cache_timeout = 0
p.scm_type = "git"
p.scm_url = "http://github.com/ansible/ansible.git"
p.save()
j1 = objects1.jobs['job_should_start']
j1.status = 'pending'
j1.save()
objects2 = job_template_factory('jt2', organization=objects1.organization, project=p, inventory='inv2', credential='cred2', jobs=["job_should_still_start"])
objects2 = job_template_factory('jt2', organization=objects1.organization, project=p, inventory='inv2', credential='cred2')
objects2.job_template.instance_groups.add(ig2)
j2 = objects2.jobs['job_should_still_start']
j2.status = 'pending'
j2.save()
j2 = create_job(objects2.job_template, dependencies_processed=False)
with mocker.patch("awx.main.scheduler.TaskManager.start_task"):
DependencyManager().schedule()
TaskManager().schedule()
pu = p.project_updates.first()
TaskManager.start_task.assert_called_once_with(pu, controlplane_instance_group, [j1, j2], controlplane_instance_group.instances.all()[0])
@@ -59,6 +59,7 @@ def test_multi_group_with_shared_dependency(instance_factory, controlplane_insta
pu.status = "successful"
pu.save()
with mock.patch("awx.main.scheduler.TaskManager.start_task"):
DependencyManager().schedule()
TaskManager().schedule()
TaskManager.start_task.assert_any_call(j1, ig1, [], i1)
@@ -69,7 +70,7 @@ def test_multi_group_with_shared_dependency(instance_factory, controlplane_insta
@pytest.mark.django_db
def test_workflow_job_no_instancegroup(workflow_job_template_factory, controlplane_instance_group, mocker):
wfjt = workflow_job_template_factory('anicedayforawalk').workflow_job_template
wfj = WorkflowJob.objects.create(workflow_job_template=wfjt)
wfj = wfjt.create_unified_job()
wfj.status = "pending"
wfj.save()
with mocker.patch("awx.main.scheduler.TaskManager.start_task"):
@@ -85,39 +86,50 @@ def test_overcapacity_blocking_other_groups_unaffected(instance_factory, control
i1.capacity = 1020
i1.save()
i2 = instance_factory("i2")
i2.capacity = 1020
i2.save()
ig1 = instance_group_factory("ig1", instances=[i1])
ig2 = instance_group_factory("ig2", instances=[i2])
objects1 = job_template_factory('jt1', organization='org1', project='proj1', inventory='inv1', credential='cred1', jobs=["job_should_start"])
objects1 = job_template_factory('jt1', organization='org1', project='proj1', inventory='inv1', credential='cred1')
objects1.job_template.instance_groups.add(ig1)
j1 = objects1.jobs['job_should_start']
j1.status = 'pending'
j1.save()
objects2 = job_template_factory(
'jt2', organization=objects1.organization, project='proj2', inventory='inv2', credential='cred2', jobs=["job_should_start", "job_should_also_start"]
)
j1 = create_job(objects1.job_template)
objects2 = job_template_factory('jt2', organization=objects1.organization, project='proj2', inventory='inv2', credential='cred2')
objects2.job_template.instance_groups.add(ig1)
j1_1 = objects2.jobs['job_should_also_start']
j1_1.status = 'pending'
j1_1.save()
objects3 = job_template_factory('jt3', organization='org2', project='proj3', inventory='inv3', credential='cred3', jobs=["job_should_still_start"])
j1_1 = create_job(objects2.job_template)
objects3 = job_template_factory('jt3', organization='org2', project='proj3', inventory='inv3', credential='cred3')
objects3.job_template.instance_groups.add(ig2)
j2 = objects3.jobs['job_should_still_start']
j2.status = 'pending'
j2.save()
objects4 = job_template_factory(
'jt4', organization=objects3.organization, project='proj4', inventory='inv4', credential='cred4', jobs=["job_should_not_start"]
)
j2 = create_job(objects3.job_template)
objects4 = job_template_factory('jt4', organization=objects3.organization, project='proj4', inventory='inv4', credential='cred4')
objects4.job_template.instance_groups.add(ig2)
j2_1 = objects4.jobs['job_should_not_start']
j2_1.status = 'pending'
j2_1.save()
tm = TaskManager()
j2_1 = create_job(objects4.job_template)
with mock.patch('awx.main.models.Job.task_impact', new_callable=mock.PropertyMock) as mock_task_impact:
mock_task_impact.return_value = 500
with mock.patch.object(TaskManager, "start_task", wraps=tm.start_task) as mock_job:
tm.schedule()
mock_job.assert_has_calls([mock.call(j1, ig1, [], i1), mock.call(j1_1, ig1, [], i1), mock.call(j2, ig2, [], i2)])
assert mock_job.call_count == 3
TaskManager().schedule()
# all jobs should be able to run, plenty of capacity across both instances
for j in [j1, j1_1, j2, j2_1]:
j.refresh_from_db()
assert j.status == "waiting"
# reset to pending
for j in [j1, j1_1, j2, j2_1]:
j.status = "pending"
j.save()
# shrink i2 so that it can only fit 1 job
i2.capacity = 510
i2.save()
TaskManager().schedule()
for j in [j1, j1_1, j2]:
j.refresh_from_db()
assert j.status == "waiting"
j2_1.refresh_from_db()
# could not run because i2 is full
assert j2_1.status == "pending"
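The capacity arithmetic behind these assertions is straightforward; this is an illustrative check (assumed, not the scheduler's real code): an instance keeps accepting work while the sum of task_impact values fits within its capacity.

def fits(instance_capacity, running_impacts, new_impact):
    return sum(running_impacts) + new_impact <= instance_capacity

assert fits(1020, [500], 500)        # both jobs on i2 reach "waiting"
assert not fits(510, [500], 500)     # j2_1 stays "pending" once i2 shrinks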
@pytest.mark.django_db
@@ -126,19 +138,13 @@ def test_failover_group_run(instance_factory, controlplane_instance_group, mocke
i2 = instance_factory("i2")
ig1 = instance_group_factory("ig1", instances=[i1])
ig2 = instance_group_factory("ig2", instances=[i2])
objects1 = job_template_factory('jt1', organization='org1', project='proj1', inventory='inv1', credential='cred1', jobs=["job_should_start"])
objects1 = job_template_factory('jt1', organization='org1', project='proj1', inventory='inv1', credential='cred1')
objects1.job_template.instance_groups.add(ig1)
j1 = objects1.jobs['job_should_start']
j1.status = 'pending'
j1.save()
objects2 = job_template_factory(
'jt2', organization=objects1.organization, project='proj2', inventory='inv2', credential='cred2', jobs=["job_should_start", "job_should_also_start"]
)
j1 = create_job(objects1.job_template)
objects2 = job_template_factory('jt2', organization=objects1.organization, project='proj2', inventory='inv2', credential='cred2')
objects2.job_template.instance_groups.add(ig1)
objects2.job_template.instance_groups.add(ig2)
j1_1 = objects2.jobs['job_should_also_start']
j1_1.status = 'pending'
j1_1.save()
j1_1 = create_job(objects2.job_template)
tm = TaskManager()
with mock.patch('awx.main.models.Job.task_impact', new_callable=mock.PropertyMock) as mock_task_impact:
mock_task_impact.return_value = 500


@@ -3,21 +3,19 @@ from unittest import mock
import json
from datetime import timedelta
from awx.main.scheduler import TaskManager
from awx.main.scheduler.dependency_graph import DependencyGraph
from awx.main.scheduler import TaskManager, DependencyManager, WorkflowManager
from awx.main.utils import encrypt_field
from awx.main.models import WorkflowJobTemplate, JobTemplate, Job
from awx.main.models.ha import Instance
from . import create_job
from django.conf import settings
@pytest.mark.django_db
def test_single_job_scheduler_launch(hybrid_instance, controlplane_instance_group, job_template_factory, mocker):
instance = controlplane_instance_group.instances.all()[0]
objects = job_template_factory('jt', organization='org1', project='proj', inventory='inv', credential='cred', jobs=["job_should_start"])
j = objects.jobs["job_should_start"]
j.status = 'pending'
j.save()
objects = job_template_factory('jt', organization='org1', project='proj', inventory='inv', credential='cred')
j = create_job(objects.job_template)
with mocker.patch("awx.main.scheduler.TaskManager.start_task"):
TaskManager().schedule()
TaskManager.start_task.assert_called_once_with(j, controlplane_instance_group, [], instance)
@@ -32,10 +30,8 @@ class TestJobLifeCycle:
expect_commit - list of expected on_commit calls
If any of these are None, then the assertion is not made.
"""
if expect_schedule and len(expect_schedule) > 1:
raise RuntimeError('Task manager should reschedule itself one time, at most.')
with mock.patch('awx.main.models.unified_jobs.UnifiedJob.websocket_emit_status') as mock_channel:
with mock.patch('awx.main.utils.common._schedule_task_manager') as tm_sch:
with mock.patch('awx.main.utils.common.ScheduleManager._schedule') as tm_sch:
# Jobs are ultimately submitted in the on_commit hook, but this will not
# actually run, because it waits for the outer transaction, which is the test
# itself in this case
@@ -56,22 +52,21 @@ class TestJobLifeCycle:
wj = wfjt.create_unified_job()
assert wj.workflow_nodes.count() == 2
wj.signal_start()
tm = TaskManager()
# Transitions workflow job to running
# needs to re-schedule so it spawns jobs next round
self.run_tm(tm, [mock.call('running')], [mock.call()])
self.run_tm(TaskManager(), [mock.call('running')])
# Spawns jobs
# needs re-schedule to submit jobs next round
self.run_tm(tm, [mock.call('pending'), mock.call('pending')], [mock.call()])
self.run_tm(WorkflowManager(), [mock.call('pending'), mock.call('pending')])
assert jt.jobs.count() == 2 # task manager spawned jobs
# Submits jobs
# intermission - jobs will run and reschedule TM when finished
self.run_tm(tm, [mock.call('waiting'), mock.call('waiting')], [])
self.run_tm(DependencyManager()) # flip dependencies_processed to True
self.run_tm(TaskManager())
# I am the job runner
for job in jt.jobs.all():
job.status = 'successful'
@@ -79,7 +74,7 @@ class TestJobLifeCycle:
# Finishes workflow
# no further action is necessary, so rescheduling should not happen
self.run_tm(tm, [mock.call('successful')], [])
self.run_tm(WorkflowManager(), [mock.call('successful')])
def test_task_manager_workflow_workflow_rescheduling(self, controlplane_instance_group):
wfjts = [WorkflowJobTemplate.objects.create(name='foo')]
@@ -90,16 +85,13 @@ class TestJobLifeCycle:
wj = wfjts[0].create_unified_job()
wj.signal_start()
tm = TaskManager()
while wfjts[0].status != 'successful':
wfjts[1].refresh_from_db()
if wfjts[1].status == 'successful':
# final run, no more work to do
self.run_tm(tm, expect_schedule=[])
else:
self.run_tm(tm, expect_schedule=[mock.call()])
attempts = 10
while wfjts[0].status != 'successful' and attempts > 0:
self.run_tm(TaskManager())
self.run_tm(WorkflowManager())
wfjts[0].refresh_from_db()
attempts -= 1
def test_control_and_execution_instance(self, project, system_job_template, job_template, inventory_source, control_instance, execution_instance):
assert Instance.objects.count() == 2
@@ -113,6 +105,7 @@ class TestJobLifeCycle:
for uj in all_ujs:
uj.signal_start()
DependencyManager().schedule()
tm = TaskManager()
self.run_tm(tm)
@@ -135,6 +128,7 @@ class TestJobLifeCycle:
for uj in all_ujs:
uj.signal_start()
DependencyManager().schedule()
# There is only enough control capacity to run one of the jobs so one should end up in pending and the other in waiting
tm = TaskManager()
self.run_tm(tm)
@@ -157,6 +151,7 @@ class TestJobLifeCycle:
for uj in all_ujs:
uj.signal_start()
DependencyManager().schedule()
# There is only enough control capacity to run one of the jobs so one should end up in pending and the other in waiting
tm = TaskManager()
self.run_tm(tm)
@@ -197,63 +192,49 @@ class TestJobLifeCycle:
@pytest.mark.django_db
def test_single_jt_multi_job_launch_blocks_last(controlplane_instance_group, job_template_factory, mocker):
instance = controlplane_instance_group.instances.all()[0]
objects = job_template_factory(
'jt', organization='org1', project='proj', inventory='inv', credential='cred', jobs=["job_should_start", "job_should_not_start"]
)
j1 = objects.jobs["job_should_start"]
j1.status = 'pending'
def test_single_jt_multi_job_launch_blocks_last(job_template_factory):
objects = job_template_factory('jt', organization='org1', project='proj', inventory='inv', credential='cred')
j1 = create_job(objects.job_template)
j2 = create_job(objects.job_template)
TaskManager().schedule()
j1.refresh_from_db()
j2.refresh_from_db()
assert j1.status == "waiting"
assert j2.status == "pending"
# mimic running j1 to unblock j2
j1.status = "successful"
j1.save()
j2 = objects.jobs["job_should_not_start"]
j2.status = 'pending'
j2.save()
with mock.patch("awx.main.scheduler.TaskManager.start_task"):
TaskManager().schedule()
TaskManager.start_task.assert_called_once_with(j1, controlplane_instance_group, [], instance)
j1.status = "successful"
j1.save()
with mocker.patch("awx.main.scheduler.TaskManager.start_task"):
TaskManager().schedule()
TaskManager.start_task.assert_called_once_with(j2, controlplane_instance_group, [], instance)
TaskManager().schedule()
j2.refresh_from_db()
assert j2.status == "waiting"
@pytest.mark.django_db
def test_single_jt_multi_job_launch_allow_simul_allowed(controlplane_instance_group, job_template_factory, mocker):
instance = controlplane_instance_group.instances.all()[0]
objects = job_template_factory(
'jt', organization='org1', project='proj', inventory='inv', credential='cred', jobs=["job_should_start", "job_should_not_start"]
)
def test_single_jt_multi_job_launch_allow_simul_allowed(job_template_factory):
objects = job_template_factory('jt', organization='org1', project='proj', inventory='inv', credential='cred')
jt = objects.job_template
jt.allow_simultaneous = True
jt.save()
j1 = objects.jobs["job_should_start"]
j1.allow_simultaneous = True
j1.status = 'pending'
j1.save()
j2 = objects.jobs["job_should_not_start"]
j2.allow_simultaneous = True
j2.status = 'pending'
j2.save()
with mock.patch("awx.main.scheduler.TaskManager.start_task"):
TaskManager().schedule()
TaskManager.start_task.assert_has_calls(
[mock.call(j1, controlplane_instance_group, [], instance), mock.call(j2, controlplane_instance_group, [], instance)]
)
j1 = create_job(objects.job_template)
j2 = create_job(objects.job_template)
TaskManager().schedule()
j1.refresh_from_db()
j2.refresh_from_db()
assert j1.status == "waiting"
assert j2.status == "waiting"
@pytest.mark.django_db
def test_multi_jt_capacity_blocking(hybrid_instance, job_template_factory, mocker):
instance = hybrid_instance
controlplane_instance_group = instance.rampart_groups.first()
objects1 = job_template_factory('jt1', organization='org1', project='proj1', inventory='inv1', credential='cred1', jobs=["job_should_start"])
objects2 = job_template_factory('jt2', organization='org2', project='proj2', inventory='inv2', credential='cred2', jobs=["job_should_not_start"])
j1 = objects1.jobs["job_should_start"]
j1.status = 'pending'
j1.save()
j2 = objects2.jobs["job_should_not_start"]
j2.status = 'pending'
j2.save()
objects1 = job_template_factory('jt1', organization='org1', project='proj1', inventory='inv1', credential='cred1')
objects2 = job_template_factory('jt2', organization='org2', project='proj2', inventory='inv2', credential='cred2')
j1 = create_job(objects1.job_template)
j2 = create_job(objects2.job_template)
tm = TaskManager()
with mock.patch('awx.main.models.Job.task_impact', new_callable=mock.PropertyMock) as mock_task_impact:
mock_task_impact.return_value = 505
@@ -269,11 +250,9 @@ def test_multi_jt_capacity_blocking(hybrid_instance, job_template_factory, mocke
@pytest.mark.django_db
def test_single_job_dependencies_project_launch(controlplane_instance_group, job_template_factory, mocker):
objects = job_template_factory('jt', organization='org1', project='proj', inventory='inv', credential='cred', jobs=["job_should_start"])
objects = job_template_factory('jt', organization='org1', project='proj', inventory='inv', credential='cred')
instance = controlplane_instance_group.instances.all()[0]
j = objects.jobs["job_should_start"]
j.status = 'pending'
j.save()
j = create_job(objects.job_template, dependencies_processed=False)
p = objects.project
p.scm_update_on_launch = True
p.scm_update_cache_timeout = 0
@@ -281,12 +260,13 @@ def test_single_job_dependencies_project_launch(controlplane_instance_group, job
p.scm_url = "http://github.com/ansible/ansible.git"
p.save(skip_update=True)
with mock.patch("awx.main.scheduler.TaskManager.start_task"):
tm = TaskManager()
with mock.patch.object(TaskManager, "create_project_update", wraps=tm.create_project_update) as mock_pu:
tm.schedule()
dm = DependencyManager()
with mock.patch.object(DependencyManager, "create_project_update", wraps=dm.create_project_update) as mock_pu:
dm.schedule()
mock_pu.assert_called_once_with(j)
pu = [x for x in p.project_updates.all()]
assert len(pu) == 1
TaskManager().schedule()
TaskManager.start_task.assert_called_once_with(pu[0], controlplane_instance_group, [j], instance)
pu[0].status = "successful"
pu[0].save()
@@ -297,11 +277,9 @@ def test_single_job_dependencies_project_launch(controlplane_instance_group, job
@pytest.mark.django_db
def test_single_job_dependencies_inventory_update_launch(controlplane_instance_group, job_template_factory, mocker, inventory_source_factory):
objects = job_template_factory('jt', organization='org1', project='proj', inventory='inv', credential='cred', jobs=["job_should_start"])
objects = job_template_factory('jt', organization='org1', project='proj', inventory='inv', credential='cred')
instance = controlplane_instance_group.instances.all()[0]
j = objects.jobs["job_should_start"]
j.status = 'pending'
j.save()
j = create_job(objects.job_template, dependencies_processed=False)
i = objects.inventory
ii = inventory_source_factory("ec2")
ii.source = "ec2"
@@ -310,12 +288,13 @@ def test_single_job_dependencies_inventory_update_launch(controlplane_instance_g
ii.save()
i.inventory_sources.add(ii)
with mock.patch("awx.main.scheduler.TaskManager.start_task"):
tm = TaskManager()
with mock.patch.object(TaskManager, "create_inventory_update", wraps=tm.create_inventory_update) as mock_iu:
tm.schedule()
dm = DependencyManager()
with mock.patch.object(DependencyManager, "create_inventory_update", wraps=dm.create_inventory_update) as mock_iu:
dm.schedule()
mock_iu.assert_called_once_with(j, ii)
iu = [x for x in ii.inventory_updates.all()]
assert len(iu) == 1
TaskManager().schedule()
TaskManager.start_task.assert_called_once_with(iu[0], controlplane_instance_group, [j], instance)
iu[0].status = "successful"
iu[0].save()
@@ -334,19 +313,17 @@ def test_inventory_update_launches_project_update(controlplane_instance_group, s
iu.status = "pending"
iu.save()
with mock.patch("awx.main.scheduler.TaskManager.start_task"):
tm = TaskManager()
with mock.patch.object(TaskManager, "create_project_update", wraps=tm.create_project_update) as mock_pu:
tm.schedule()
dm = DependencyManager()
with mock.patch.object(DependencyManager, "create_project_update", wraps=dm.create_project_update) as mock_pu:
dm.schedule()
mock_pu.assert_called_with(iu, project_id=project.id)
@pytest.mark.django_db
def test_job_dependency_with_already_updated(controlplane_instance_group, job_template_factory, mocker, inventory_source_factory):
objects = job_template_factory('jt', organization='org1', project='proj', inventory='inv', credential='cred', jobs=["job_should_start"])
objects = job_template_factory('jt', organization='org1', project='proj', inventory='inv', credential='cred')
instance = controlplane_instance_group.instances.all()[0]
j = objects.jobs["job_should_start"]
j.status = 'pending'
j.save()
j = create_job(objects.job_template, dependencies_processed=False)
i = objects.inventory
ii = inventory_source_factory("ec2")
ii.source = "ec2"
@@ -359,9 +336,9 @@ def test_job_dependency_with_already_updated(controlplane_instance_group, job_te
j.start_args = encrypt_field(j, field_name="start_args")
j.save()
with mock.patch("awx.main.scheduler.TaskManager.start_task"):
tm = TaskManager()
with mock.patch.object(TaskManager, "create_inventory_update", wraps=tm.create_inventory_update) as mock_iu:
tm.schedule()
dm = DependencyManager()
with mock.patch.object(DependencyManager, "create_inventory_update", wraps=dm.create_inventory_update) as mock_iu:
dm.schedule()
mock_iu.assert_not_called()
with mock.patch("awx.main.scheduler.TaskManager.start_task"):
TaskManager().schedule()
@@ -371,13 +348,11 @@ def test_job_dependency_with_already_updated(controlplane_instance_group, job_te
@pytest.mark.django_db
def test_shared_dependencies_launch(controlplane_instance_group, job_template_factory, mocker, inventory_source_factory):
instance = controlplane_instance_group.instances.all()[0]
objects = job_template_factory('jt', organization='org1', project='proj', inventory='inv', credential='cred', jobs=["first_job", "second_job"])
j1 = objects.jobs["first_job"]
j1.status = 'pending'
j1.save()
j2 = objects.jobs["second_job"]
j2.status = 'pending'
j2.save()
objects = job_template_factory('jt', organization='org1', project='proj', inventory='inv', credential='cred')
objects.job_template.allow_simultaneous = True
objects.job_template.save()
j1 = create_job(objects.job_template, dependencies_processed=False)
j2 = create_job(objects.job_template, dependencies_processed=False)
p = objects.project
p.scm_update_on_launch = True
p.scm_update_cache_timeout = 300
@@ -392,8 +367,8 @@ def test_shared_dependencies_launch(controlplane_instance_group, job_template_fa
ii.update_cache_timeout = 300
ii.save()
i.inventory_sources.add(ii)
with mock.patch("awx.main.scheduler.TaskManager.start_task"):
DependencyManager().schedule()
TaskManager().schedule()
pu = p.project_updates.first()
iu = ii.inventory_updates.first()
@@ -408,12 +383,9 @@ def test_shared_dependencies_launch(controlplane_instance_group, job_template_fa
iu.save()
with mock.patch("awx.main.scheduler.TaskManager.start_task"):
TaskManager().schedule()
TaskManager.start_task.assert_called_once_with(j1, controlplane_instance_group, [], instance)
j1.status = "successful"
j1.save()
with mock.patch("awx.main.scheduler.TaskManager.start_task"):
TaskManager().schedule()
TaskManager.start_task.assert_called_once_with(j2, controlplane_instance_group, [], instance)
TaskManager.start_task.assert_has_calls(
[mock.call(j1, controlplane_instance_group, [], instance), mock.call(j2, controlplane_instance_group, [], instance)]
)
pu = [x for x in p.project_updates.all()]
iu = [x for x in ii.inventory_updates.all()]
assert len(pu) == 1
@@ -422,30 +394,27 @@ def test_shared_dependencies_launch(controlplane_instance_group, job_template_fa
@pytest.mark.django_db
def test_job_not_blocking_project_update(controlplane_instance_group, job_template_factory):
objects = job_template_factory('jt', organization='org1', project='proj', inventory='inv', credential='cred', jobs=["job"])
job = objects.jobs["job"]
instance = controlplane_instance_group.instances.all()[0]
objects = job_template_factory('jt', organization='org1', project='proj', inventory='inv', credential='cred')
job = objects.job_template.create_unified_job()
job.instance_group = controlplane_instance_group
job.dependencies_processed = True
job.status = "running"
job.save()
with mock.patch("awx.main.scheduler.TaskManager.start_task"):
task_manager = TaskManager()
task_manager._schedule()
proj = objects.project
project_update = proj.create_project_update()
project_update.instance_group = controlplane_instance_group
project_update.status = "pending"
project_update.save()
assert not task_manager.job_blocked_by(project_update)
dependency_graph = DependencyGraph()
dependency_graph.add_job(job)
assert not dependency_graph.task_blocked_by(project_update)
TaskManager().schedule()
TaskManager.start_task.assert_called_once_with(project_update, controlplane_instance_group, [], instance)
@pytest.mark.django_db
def test_job_not_blocking_inventory_update(controlplane_instance_group, job_template_factory, inventory_source_factory):
instance = controlplane_instance_group.instances.all()[0]
objects = job_template_factory('jt', organization='org1', project='proj', inventory='inv', credential='cred', jobs=["job"])
job = objects.jobs["job"]
job.instance_group = controlplane_instance_group
@@ -453,9 +422,6 @@ def test_job_not_blocking_inventory_update(controlplane_instance_group, job_temp
job.save()
with mock.patch("awx.main.scheduler.TaskManager.start_task"):
task_manager = TaskManager()
task_manager._schedule()
inv = objects.inventory
inv_source = inventory_source_factory("ec2")
inv_source.source = "ec2"
@@ -465,11 +431,9 @@ def test_job_not_blocking_inventory_update(controlplane_instance_group, job_temp
inventory_update.status = "pending"
inventory_update.save()
assert not task_manager.job_blocked_by(inventory_update)
dependency_graph = DependencyGraph()
dependency_graph.add_job(job)
assert not dependency_graph.task_blocked_by(inventory_update)
DependencyManager().schedule()
TaskManager().schedule()
TaskManager.start_task.assert_called_once_with(inventory_update, controlplane_instance_group, [], instance)
@pytest.mark.django_db
@@ -484,7 +448,7 @@ def test_generate_dependencies_only_once(job_template_factory):
# job starts with dependencies_processed as False
assert not job.dependencies_processed
# run one cycle of ._schedule() to generate dependencies
TaskManager()._schedule()
DependencyManager().schedule()
# make sure dependencies_processed is now True
job = Job.objects.filter(name="job_gen_dep")[0]
@@ -492,7 +456,7 @@ def test_generate_dependencies_only_once(job_template_factory):
# Run ._schedule() again, but make sure .generate_dependencies() is not
# called with job in the argument list
tm = TaskManager()
tm.generate_dependencies = mock.MagicMock(return_value=[])
tm._schedule()
tm.generate_dependencies.assert_has_calls([mock.call([]), mock.call([])])
dm = DependencyManager()
dm.generate_dependencies = mock.MagicMock(return_value=[])
dm.schedule()
dm.generate_dependencies.assert_not_called()


@@ -74,34 +74,37 @@ GLqbpJyX2r3p/Rmo6mLY71SqpA==
@pytest.mark.django_db
def test_default_cred_types():
assert sorted(CredentialType.defaults.keys()) == [
'aim',
'aws',
'azure_kv',
'azure_rm',
'centrify_vault_kv',
'conjur',
'controller',
'galaxy_api_token',
'gce',
'github_token',
'gitlab_token',
'hashivault_kv',
'hashivault_ssh',
'insights',
'kubernetes_bearer_token',
'net',
'openstack',
'registry',
'rhv',
'satellite6',
'scm',
'ssh',
'thycotic_dsv',
'thycotic_tss',
'vault',
'vmware',
]
assert sorted(CredentialType.defaults.keys()) == sorted(
[
'aim',
'aws',
'azure_kv',
'azure_rm',
'centrify_vault_kv',
'conjur',
'controller',
'galaxy_api_token',
'gce',
'github_token',
'gitlab_token',
'gpg_public_key',
'hashivault_kv',
'hashivault_ssh',
'insights',
'kubernetes_bearer_token',
'net',
'openstack',
'registry',
'rhv',
'satellite6',
'scm',
'ssh',
'thycotic_dsv',
'thycotic_tss',
'vault',
'vmware',
]
)
for type_ in CredentialType.defaults.values():
assert type_().managed is True


@@ -199,9 +199,7 @@ class TestAutoScaling:
assert len(self.pool) == 10
# cleanup should scale down to 8 workers
with mock.patch('awx.main.dispatch.reaper.reap') as reap:
self.pool.cleanup()
reap.assert_called()
self.pool.cleanup()
assert len(self.pool) == 2
def test_max_scale_up(self):
@@ -246,12 +244,10 @@ class TestAutoScaling:
assert not self.pool.should_grow
alive_pid = self.pool.workers[1].pid
self.pool.workers[0].process.terminate()
time.sleep(1) # wait a moment for sigterm
time.sleep(2) # wait a moment for sigterm
# clean up; the dead worker should be removed from the pool
with mock.patch('awx.main.dispatch.reaper.reap') as reap:
self.pool.cleanup()
reap.assert_called()
self.pool.cleanup()
assert len(self.pool) == 1
assert self.pool.workers[0].pid == alive_pid
@@ -353,7 +349,7 @@ class TestJobReaper(object):
('waiting', '', '', None, False), # waiting, not assigned to the instance
('waiting', 'awx', '', None, False), # waiting, was edited less than a minute ago
('waiting', '', 'awx', None, False), # waiting, was edited less than a minute ago
('waiting', 'awx', '', yesterday, True), # waiting, assigned to the execution_node, stale
('waiting', 'awx', '', yesterday, False), # waiting, managed by another node, ignore
('waiting', '', 'awx', yesterday, True), # waiting, assigned to the controller_node, stale
],
)
@@ -372,6 +368,7 @@ class TestJobReaper(object):
# (because .save() overwrites it to _now_)
Job.objects.filter(id=j.id).update(modified=modified)
reaper.reap(i)
reaper.reap_waiting(i)
job = Job.objects.first()
if fail:
assert job.status == 'failed'


@@ -261,5 +261,6 @@ def test_inventory_update_injected_content(this_kind, inventory, fake_credential
with mock.patch.object(UnifiedJob, 'websocket_emit_status', mock.Mock()):
# The point of this test is that we replace run with assertions
with mock.patch('awx.main.tasks.receptor.AWXReceptorJob.run', substitute_run):
# so this sets up everything for a run and then yields control over to substitute_run
task.run(inventory_update.pk)
with mock.patch('awx.main.tasks.jobs.create_partition'):
# so this sets up everything for a run and then yields control over to substitute_run
task.run(inventory_update.pk)


@@ -4,6 +4,7 @@ import os
import tempfile
import shutil
from awx.main.tasks.jobs import RunJob
from awx.main.tasks.system import execution_node_health_check, _cleanup_images_and_files
from awx.main.models import Instance, Job
@@ -61,3 +62,16 @@ def test_folder_cleanup_running_job(mock_job_folder, mock_me):
job.save(update_fields=['status'])
_cleanup_images_and_files(grace_period=0)
assert not os.path.exists(mock_job_folder) # job is finished and no grace period, should delete
@pytest.mark.django_db
def test_does_not_run_reaped_job(mocker, mock_me):
job = Job.objects.create(status='failed', job_explanation='This job has been reaped.')
mock_run = mocker.patch('awx.main.tasks.jobs.ansible_runner.interface.run')
try:
RunJob().run(job.id)
except Exception:
pass
job.refresh_from_db()
assert job.status == 'failed'
mock_run.assert_not_called()


@@ -0,0 +1,47 @@
import pytest
from django.db import DatabaseError
from awx.main.models.jobs import Job
from awx.main.utils.update_model import update_model
@pytest.fixture
def normal_job(deploy_jobtemplate):
return deploy_jobtemplate.create_unified_job()
class NewException(Exception):
pass
@pytest.mark.django_db
def test_normal_get(normal_job):
mod_job = Job.objects.get(pk=normal_job.id)
mod_job.job_explanation = 'foobar'
mod_job.save(update_fields=['job_explanation'])
new_job = update_model(Job, normal_job.pk)
assert new_job.job_explanation == 'foobar'
@pytest.mark.django_db
def test_exception(normal_job, mocker):
mocker.patch.object(Job.objects, 'get', side_effect=DatabaseError)
mocker.patch('awx.main.utils.update_model.time.sleep')
with pytest.raises(DatabaseError):
update_model(Job, normal_job.pk)
@pytest.mark.django_db
def test_unknown_exception(normal_job, mocker):
mocker.patch.object(Job.objects, 'get', side_effect=NewException)
mocker.patch('awx.main.utils.update_model.time.sleep')
with pytest.raises(NewException):
update_model(Job, normal_job.pk)
@pytest.mark.django_db
def test_deleted_job(normal_job):
job_pk = normal_job.pk
normal_job.delete()
assert update_model(Job, job_pk) is None


@@ -90,7 +90,7 @@ def test_finish_job_fact_cache_with_existing_data(job, hosts, inventory, mocker,
assert host.ansible_facts == {"a": 1, "b": 2}
assert host.ansible_facts_modified is None
assert hosts[1].ansible_facts == ansible_facts_new
hosts[1].save.assert_called_once_with()
hosts[1].save.assert_called_once_with(update_fields=['ansible_facts', 'ansible_facts_modified'])
def test_finish_job_fact_cache_with_bad_data(job, hosts, inventory, mocker, tmpdir):


@@ -22,6 +22,10 @@ def test_unified_job_workflow_attributes():
assert job.workflow_job_id == 1
def mock_on_commit(f):
f()
@pytest.fixture
def unified_job(mocker):
mocker.patch.object(UnifiedJob, 'can_cancel', return_value=True)
@@ -30,12 +34,14 @@ def unified_job(mocker):
j.cancel_flag = None
j.save = mocker.MagicMock()
j.websocket_emit_status = mocker.MagicMock()
j.fallback_cancel = mocker.MagicMock()
return j
def test_cancel(unified_job):
unified_job.cancel()
with mock.patch('awx.main.models.unified_jobs.connection.on_commit', wraps=mock_on_commit):
unified_job.cancel()
assert unified_job.cancel_flag is True
assert unified_job.status == 'canceled'
@@ -50,10 +56,11 @@ def test_cancel(unified_job):
def test_cancel_job_explanation(unified_job):
job_explanation = 'giggity giggity'
unified_job.cancel(job_explanation=job_explanation)
with mock.patch('awx.main.models.unified_jobs.connection.on_commit'):
unified_job.cancel(job_explanation=job_explanation)
assert unified_job.job_explanation == job_explanation
unified_job.save.assert_called_with(update_fields=['cancel_flag', 'start_args', 'status', 'job_explanation'])
unified_job.save.assert_called_with(update_fields=['cancel_flag', 'start_args', 'job_explanation', 'status'])
def test_organization_copy_to_jobs():


@@ -34,7 +34,7 @@ from awx.main.models import (
)
from awx.main.models.credential import HIDDEN_PASSWORD, ManagedCredentialType
from awx.main.tasks import jobs, system
from awx.main.tasks import jobs, system, receptor
from awx.main.utils import encrypt_field, encrypt_value
from awx.main.utils.safe_yaml import SafeLoader
from awx.main.utils.execution_environments import CONTAINER_ROOT
@@ -42,6 +42,8 @@ from awx.main.utils.execution_environments import CONTAINER_ROOT
from awx.main.utils.licensing import Licenser
from awx.main.constants import JOB_VARIABLE_PREFIXES
from receptorctl.socket_interface import ReceptorControl
def to_host_path(path, private_data_dir):
"""Given a path inside of the EE container, this gives the absolute path
@@ -78,6 +80,12 @@ def patch_Job():
yield
@pytest.fixture
def mock_create_partition():
with mock.patch('awx.main.tasks.jobs.create_partition') as cp_mock:
yield cp_mock
@pytest.fixture
def patch_Organization():
_credentials = []
@@ -461,7 +469,7 @@ class TestExtraVarSanitation(TestJobExecution):
class TestGenericRun:
def test_generic_failure(self, patch_Job, execution_environment, mock_me):
def test_generic_failure(self, patch_Job, execution_environment, mock_me, mock_create_partition):
job = Job(status='running', inventory=Inventory(), project=Project(local_path='/projects/_23_foo'))
job.websocket_emit_status = mock.Mock()
job.execution_environment = execution_environment
@@ -472,7 +480,7 @@ class TestGenericRun:
task.model.objects.get = mock.Mock(return_value=job)
task.build_private_data_files = mock.Mock(side_effect=OSError())
with mock.patch('awx.main.tasks.jobs.copy_tree'):
with mock.patch('awx.main.tasks.jobs.shutil.copytree'):
with pytest.raises(Exception):
task.run(1)
@@ -481,7 +489,7 @@ class TestGenericRun:
assert update_model_call['status'] == 'error'
assert update_model_call['emitted_events'] == 0
def test_cancel_flag(self, job, update_model_wrapper, execution_environment, mock_me):
def test_cancel_flag(self, job, update_model_wrapper, execution_environment, mock_me, mock_create_partition):
job.status = 'running'
job.cancel_flag = True
job.websocket_emit_status = mock.Mock()
@@ -494,11 +502,11 @@ class TestGenericRun:
task.model.objects.get = mock.Mock(return_value=job)
task.build_private_data_files = mock.Mock()
with mock.patch('awx.main.tasks.jobs.copy_tree'):
with mock.patch('awx.main.tasks.jobs.shutil.copytree'):
with pytest.raises(Exception):
task.run(1)
for c in [mock.call(1, status='running', start_args=''), mock.call(1, status='canceled')]:
for c in [mock.call(1, start_args='', status='canceled')]:
assert c in task.update_model.call_args_list
def test_event_count(self, mock_me):
@@ -580,7 +588,7 @@ class TestGenericRun:
@pytest.mark.django_db
class TestAdhocRun(TestJobExecution):
def test_options_jinja_usage(self, adhoc_job, adhoc_update_model_wrapper, mock_me):
def test_options_jinja_usage(self, adhoc_job, adhoc_update_model_wrapper, mock_me, mock_create_partition):
ExecutionEnvironment.objects.create(name='Control Plane EE', managed=True)
ExecutionEnvironment.objects.create(name='Default Job EE', managed=False)
@@ -1934,7 +1942,7 @@ def test_managed_injector_redaction(injector_cls):
assert 'very_secret_value' not in str(build_safe_env(env))
def test_job_run_no_ee(mock_me):
def test_job_run_no_ee(mock_me, mock_create_partition):
org = Organization(pk=1)
proj = Project(pk=1, organization=org)
job = Job(project=proj, organization=org, inventory=Inventory(pk=1))
@@ -1944,7 +1952,7 @@ def test_job_run_no_ee(mock_me):
task.update_model = mock.Mock(return_value=job)
task.model.objects.get = mock.Mock(return_value=job)
with mock.patch('awx.main.tasks.jobs.copy_tree'):
with mock.patch('awx.main.tasks.jobs.shutil.copytree'):
with pytest.raises(RuntimeError) as e:
task.pre_run_hook(job, private_data_dir)
@@ -1965,3 +1973,120 @@ def test_project_update_no_ee(mock_me):
task.build_env(job, {})
assert 'The project could not sync because there is no Execution Environment' in str(e.value)
@pytest.mark.parametrize(
'work_unit_data, expected_function_call',
[
[
# if (extra_data is None): continue
{
'zpdFi4BX': {
'ExtraData': None,
}
},
False,
],
[
# Extra data is a string and StateName is None
{
"y4NgMKKW": {
"ExtraData": "Unknown WorkType",
}
},
False,
],
[
# Extra data is a string and StateName in RECEPTOR_ACTIVE_STATES
{
"y4NgMKKW": {
"ExtraData": "Unknown WorkType",
"StateName": "Running",
}
},
False,
],
[
# Extra data is a string and StateName not in RECEPTOR_ACTIVE_STATES
{
"y4NgMKKW": {
"ExtraData": "Unknown WorkType",
"StateName": "Succeeded",
}
},
True,
],
[
# Extra data is a dict but RemoteWorkType is not ansible-runner
{
"y4NgMKKW": {
'ExtraData': {
'RemoteWorkType': 'not-ansible-runner',
},
}
},
False,
],
[
# Extra data is a dict and it's an ansible-runner, but we have no params
{
'zpdFi4BX': {
'ExtraData': {
'RemoteWorkType': 'ansible-runner',
},
}
},
False,
],
[
# Extra data is a dict and it's an ansible-runner, but params is not --worker-info
{
'zpdFi4BX': {
'ExtraData': {'RemoteWorkType': 'ansible-runner', 'RemoteParams': {'params': '--not-worker-info'}},
}
},
False,
],
[
# Extra data is a dict and it's an ansible-runner, but params does not start with cleanup
{
'zpdFi4BX': {
'ExtraData': {'RemoteWorkType': 'ansible-runner', 'RemoteParams': {'params': 'not cleanup stuff'}},
}
},
False,
],
[
# Extra data is a dict and it's an ansible-runner w/ params, but still running
{
'zpdFi4BX': {
'ExtraData': {'RemoteWorkType': 'ansible-runner', 'RemoteParams': {'params': '--worker-info'}},
"StateName": "Running",
}
},
False,
],
[
# Extra data is a dict and it's an ansible-runner w/ params, and completed
{
'zpdFi4BX': {
'ExtraData': {'RemoteWorkType': 'ansible-runner', 'RemoteParams': {'params': '--worker-info'}},
"StateName": "Succeeded",
}
},
True,
],
],
)
def test_administrative_workunit_reaper(work_unit_data, expected_function_call):
# Mock the get_receptor_ctl call and let it return a dummy object
# It does not matter what file name we return as the socket because we won't actually call receptor (unless something is broken)
with mock.patch('awx.main.tasks.receptor.get_receptor_ctl') as mock_get_receptor_ctl:
mock_get_receptor_ctl.return_value = ReceptorControl('/var/run/awx-receptor/receptor.sock')
with mock.patch('receptorctl.socket_interface.ReceptorControl.simple_command') as simple_command:
receptor.administrative_workunit_reaper(work_list=work_unit_data)
if expected_function_call:
simple_command.assert_called()
else:
simple_command.assert_not_called()
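The parametrized comments above describe when the administrative reaper is allowed to release a Receptor work unit. Below is a hypothetical sketch of that filtering decision, consistent with the ten cases but not taken from the implementation (the RECEPTOR_ACTIVE_STATES value is assumed, and the real function also issues the release command over the Receptor socket):

RECEPTOR_ACTIVE_STATES = ('Pending', 'Running')  # assumed value

def should_release(unit):
    extra = unit.get('ExtraData')
    if extra is None:
        return False
    if unit.get('StateName') is None or unit.get('StateName') in RECEPTOR_ACTIVE_STATES:
        return False  # still (or possibly still) running
    if isinstance(extra, dict):
        if extra.get('RemoteWorkType') != 'ansible-runner':
            return False
        params = (extra.get('RemoteParams') or {}).get('params', '')
        if params != '--worker-info' and not params.startswith('cleanup'):
            return False  # a real job's work unit; leave it for the normal reaper
    return True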


@@ -6,6 +6,7 @@ from datetime import timedelta
import json
import yaml
import logging
import time
import os
import subprocess
import re
@@ -78,8 +79,9 @@ __all__ = [
'IllegalArgumentError',
'get_custom_venv_choices',
'get_external_account',
'task_manager_bulk_reschedule',
'schedule_task_manager',
'ScheduleTaskManager',
'ScheduleDependencyManager',
'ScheduleWorkflowManager',
'classproperty',
'create_temporary_fifo',
'truncate_stdout',
@@ -846,6 +848,66 @@ def get_mem_effective_capacity(mem_bytes):
_inventory_updates = threading.local()
_task_manager = threading.local()
_dependency_manager = threading.local()
_workflow_manager = threading.local()
@contextlib.contextmanager
def task_manager_bulk_reschedule():
"""Context manager to avoid submitting the task managers multiple times."""
managers = [ScheduleTaskManager(), ScheduleWorkflowManager(), ScheduleDependencyManager()]
try:
for m in managers:
m.previous_flag = getattr(m.manager_threading_local, 'bulk_reschedule', False)
m.previous_value = getattr(m.manager_threading_local, 'needs_scheduling', False)
m.manager_threading_local.bulk_reschedule = True
m.manager_threading_local.needs_scheduling = False
yield
finally:
for m in managers:
m.manager_threading_local.bulk_reschedule = m.previous_flag
if m.manager_threading_local.needs_scheduling:
m.schedule()
m.manager_threading_local.needs_scheduling = m.previous_value
class ScheduleManager:
def __init__(self, manager, manager_threading_local):
self.manager = manager
self.manager_threading_local = manager_threading_local
def _schedule(self):
from django.db import connection
# runs right away if not in transaction
connection.on_commit(lambda: self.manager.delay())
def schedule(self):
if getattr(self.manager_threading_local, 'bulk_reschedule', False):
self.manager_threading_local.needs_scheduling = True
return
self._schedule()
class ScheduleTaskManager(ScheduleManager):
def __init__(self):
from awx.main.scheduler.tasks import task_manager
super().__init__(task_manager, _task_manager)
class ScheduleDependencyManager(ScheduleManager):
def __init__(self):
from awx.main.scheduler.tasks import dependency_manager
super().__init__(dependency_manager, _dependency_manager)
class ScheduleWorkflowManager(ScheduleManager):
def __init__(self):
from awx.main.scheduler.tasks import workflow_manager
super().__init__(workflow_manager, _workflow_manager)
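A hypothetical usage sketch of the classes above (the caller name submit_many is made up; the imports point at the module this hunk modifies): inside task_manager_bulk_reschedule(), repeated schedule() calls only flag the thread-local, and each flagged manager's background task is submitted once when the block exits.

from awx.main.utils.common import (
    ScheduleDependencyManager,
    ScheduleTaskManager,
    task_manager_bulk_reschedule,
)

def submit_many(jobs):
    with task_manager_bulk_reschedule():
        for job in jobs:
            job.status = 'pending'
            job.save(update_fields=['status'])
            # inside the block these only set needs_scheduling; nothing is sent yet
            ScheduleDependencyManager().schedule()
            ScheduleTaskManager().schedule()
    # exiting the block submits dependency_manager and task_manager once each,
    # deferred through connection.on_commit()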
@contextlib.contextmanager
@@ -861,37 +923,6 @@ def ignore_inventory_computed_fields():
_inventory_updates.is_updating = previous_value
def _schedule_task_manager():
from awx.main.scheduler.tasks import run_task_manager
from django.db import connection
# runs right away if not in transaction
connection.on_commit(lambda: run_task_manager.delay())
@contextlib.contextmanager
def task_manager_bulk_reschedule():
"""Context manager to avoid submitting task multiple times."""
try:
previous_flag = getattr(_task_manager, 'bulk_reschedule', False)
previous_value = getattr(_task_manager, 'needs_scheduling', False)
_task_manager.bulk_reschedule = True
_task_manager.needs_scheduling = False
yield
finally:
_task_manager.bulk_reschedule = previous_flag
if _task_manager.needs_scheduling:
_schedule_task_manager()
_task_manager.needs_scheduling = previous_value
def schedule_task_manager():
if getattr(_task_manager, 'bulk_reschedule', False):
_task_manager.needs_scheduling = True
return
_schedule_task_manager()
@contextlib.contextmanager
def ignore_inventory_group_removal():
"""
@@ -1153,3 +1184,19 @@ def cleanup_new_process(func):
return func(*args, **kwargs)
return wrapper_cleanup_new_process
def log_excess_runtime(func_logger, cutoff=5.0):
def log_excess_runtime_decorator(func):
@wraps(func)
def _new_func(*args, **kwargs):
start_time = time.time()
return_value = func(*args, **kwargs)
delta = time.time() - start_time
if delta > cutoff:
func_logger.info(f'Running {func.__name__!r} took {delta:.2f}s')
return return_value
return _new_func
return log_excess_runtime_decorator
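For illustration only (the function name and logger below are made up), applying the decorator above so that any run longer than the cutoff gets logged:

import logging
import time

demo_logger = logging.getLogger('awx.main.demo')

@log_excess_runtime(demo_logger, cutoff=1.0)
def rebuild_host_summaries():
    time.sleep(1.5)  # stand-in for a slow operation

rebuild_host_summaries()  # logs something like: Running 'rebuild_host_summaries' took 1.50s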


@@ -76,7 +76,7 @@ class SpecialInventoryHandler(logging.Handler):
def emit(self, record):
# check cancel and timeout status regardless of log level
this_time = now()
if (this_time - self.last_check).total_seconds() > 0.5: # cancel callback is expensive
if (this_time - self.last_check).total_seconds() > 0.1:
self.last_check = this_time
if self.cancel_callback():
raise PostRunError('Inventory update has been canceled', status='canceled')


@@ -1,4 +1,5 @@
from django.db import transaction, DatabaseError, InterfaceError
from django.core.exceptions import ObjectDoesNotExist
import logging
import time
@@ -32,6 +33,8 @@ def update_model(model, pk, _attempt=0, _max_attempts=5, select_for_update=False
update_fields.append('failed')
instance.save(update_fields=update_fields)
return instance
except ObjectDoesNotExist:
return None
except (DatabaseError, InterfaceError) as e:
# Log out the error to the debug logger.
logger.debug('Database error updating %s, retrying in 5 seconds (retry #%d): %s', model._meta.object_name, _attempt + 1, e)
@@ -45,4 +48,5 @@ def update_model(model, pk, _attempt=0, _max_attempts=5, select_for_update=False
raise RuntimeError(f'Could not fetch {pk} because of receiving abort signal')
return update_model(model, pk, _attempt=_attempt + 1, _max_attempts=_max_attempts, **updates)
else:
logger.error('Failed to update %s after %d retries.', model._meta.object_name, _attempt)
logger.warning(f'Failed to update {model._meta.object_name} pk={pk} after {_attempt} retries.')
raise
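With ObjectDoesNotExist now returning None instead of raising, callers need to tolerate a deleted row. A small illustrative caller (mark_reaped is a made-up name):

from awx.main.models.jobs import Job
from awx.main.utils.update_model import update_model

def mark_reaped(job_id):
    job = update_model(Job, job_id, status='failed', job_explanation='This job has been reaped.')
    if job is None:
        return None  # the job row was deleted while we were working; nothing to update
    return job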


@@ -0,0 +1,115 @@
from __future__ import absolute_import, division, print_function
__metaclass__ = type
import gnupg
import os
import tempfile
from ansible.module_utils.basic import *
from ansible.plugins.action import ActionBase
from ansible.utils.display import Display
from ansible_sign.checksum import (
ChecksumFile,
ChecksumMismatch,
InvalidChecksumLine,
)
from ansible_sign.checksum.differ import DistlibManifestChecksumFileExistenceDiffer
from ansible_sign.signing import *
display = Display()
VALIDATION_TYPES = (
"checksum_manifest",
"gpg",
)
class ActionModule(ActionBase):
def run(self, tmp=None, task_vars=None):
self._supports_check_mode = False
super(ActionModule, self).run(tmp, task_vars)
self.params = self._task.args
self.project_path = self.params.get("project_path")
if self.project_path is None:
return {
"failed": True,
"msg": "No project path (project_path) was supplied.",
}
validation_type = self.params.get("validation_type")
if validation_type is None or validation_type not in VALIDATION_TYPES:
return {"failed": True, "msg": "validation_type must be one of: " + ', '.join(VALIDATION_TYPES)}
validation_method = getattr(self, f"validate_{validation_type}")
return validation_method()
def validate_gpg(self):
gpg_pubkey = self.params.get("gpg_pubkey")
if gpg_pubkey is None:
return {
"failed": True,
"msg": "No GPG public key (gpg_pubkey) was supplied.",
}
signature_file = os.path.join(self.project_path, ".ansible-sign", "sha256sum.txt.sig")
manifest_file = os.path.join(self.project_path, ".ansible-sign", "sha256sum.txt")
for path in (signature_file, manifest_file):
if not os.path.exists(path):
return {
"failed": True,
"msg": f"Expected file not found: {path}",
}
with tempfile.TemporaryDirectory() as gpg_home:
gpg = gnupg.GPG(gnupghome=gpg_home)
gpg.import_keys(gpg_pubkey)
verifier = GPGVerifier(
manifest_path=manifest_file,
detached_signature_path=signature_file,
gpg_home=gpg_home,
)
result = verifier.verify()
return {
"failed": not result.success,
"msg": result.summary,
"gpg_details": result.extra_information,
}
def validate_checksum_manifest(self):
checksum = ChecksumFile(self.project_path, differ=DistlibManifestChecksumFileExistenceDiffer)
manifest_file = os.path.join(self.project_path, ".ansible-sign", "sha256sum.txt")
if not os.path.exists(manifest_file):
return {
"failed": True,
"msg": f"Expected file not found: {path}",
}
checksum_file_contents = open(manifest_file, "r").read()
try:
manifest = checksum.parse(checksum_file_contents)
except InvalidChecksumLine as e:
return {
"failed": True,
"msg": f"Invalid line in checksum manifest: {e}",
}
try:
checksum.verify(manifest)
except ChecksumMismatch as e:
return {
"failed": True,
"msg": str(e),
}
return {
"failed": False,
"msg": "Checksum manifest is valid.",
}


@@ -0,0 +1,65 @@
ANSIBLE_METADATA = {"metadata_version": "1.0", "status": ["stableinterface"], "supported_by": "community"}
DOCUMENTATION = """
---
module: playbook_integrity
short_description: verify that files within a project have not been tampered with.
description:
- Makes use of the 'ansible-sign' project as a library for ensuring that an
Ansible project has not been tampered with.
- There are multiple types of validation that this action plugin supports;
currently these are GPG public/private key signing of a checksum manifest
file, and checking the checksum manifest file itself against the checksum
of each file that is being verified.
- In the future, other types of validation may be supported.
options:
project_path:
description:
- Directory of the project being verified. Expected to contain a
C(.ansible-sign) directory with a generated checksum manifest file and a
detached signature for it. These files are produced by the
C(ansible-sign) command-line utility.
required: true
validation_type:
description:
- Describes the kind of validation to perform on the project.
- I(validation_type=gpg) means that a GPG Public Key credential is being
used to verify the integrity of the checksum manifest (and therefore the
project).
- I(validation_type=checksum_manifest) means that the signed checksum manifest is validated
against all files in the project listed by its MANIFEST.in file. Just
running this plugin with I(validation_type=checksum_manifest) is
typically B(NOT) enough. It should also be run with a I(validation_type)
that ensures that the manifest file itself has not changed, such as
I(validation_type=gpg).
required: true
choices:
- gpg
- checksum_manifest
gpg_pubkey:
description:
- The public key to validate a checksum manifest against. Must match the
detached signature in the project's C(.ansible-sign) directory.
- Required when I(validation_type=gpg).
author:
- Ansible AWX Team
"""
EXAMPLES = """
- name: Verify project content using GPG signature
playbook_integrity:
project_path: /srv/projects/example
validation_type: gpg
gpg_pubkey: |
-----BEGIN PGP PUBLIC KEY BLOCK-----
mWINAFXMtjsACADIf/zJS0V3UO3c+KAUcpVAcChpliM31ICDWydfIfF3dzMzLcCd
Cj2kk1mPWtP/JHfk1V5czcWWWWGC2Tw4g4IS+LokAAuwk7VKTlI34eeMl8SiZCAI
[...]
- name: Verify project content against checksum manifest
playbook_integrity:
project_path: /srv/projects/example
validation_type: checksum_manifest
"""


@@ -18,6 +18,7 @@
# galaxy_task_env: environment variables to use specifically for ansible-galaxy commands
# awx_version: Current running version of the awx or tower as a string
# awx_license_type: "open" for AWX; else presume Tower
# gpg_pubkey: the GPG public key to use for validation, when enabled
- hosts: localhost
gather_facts: false
@@ -153,6 +154,28 @@
- update_insights
- update_archive
- hosts: localhost
gather_facts: false
connection: local
name: Perform project signature/checksum verification
tasks:
- name: Verify project content using GPG signature
playbook_integrity:
project_path: "{{ project_path | quote }}"
validation_type: gpg
gpg_pubkey: "{{ gpg_pubkey }}"
register: gpg_result
tags:
- validation_gpg_public_key
- name: Verify project content against checksum manifest
playbook_integrity:
project_path: "{{ project_path | quote }}"
validation_type: checksum_manifest
register: checksum_result
tags:
- validation_checksum_manifest
- hosts: localhost
gather_facts: false
connection: local


@@ -6,6 +6,7 @@ import os
import re # noqa
import sys
import tempfile
import socket
from datetime import timedelta
@@ -104,7 +105,7 @@ STATICFILES_DIRS = (os.path.join(BASE_DIR, 'ui', 'build', 'static'), os.path.joi
# Absolute filesystem path to the directory where static file are collected via
# the collectstatic command.
STATIC_ROOT = os.path.join(BASE_DIR, 'public', 'static')
STATIC_ROOT = '/var/lib/awx/public/static'
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/dev/howto/static-files/
@@ -248,6 +249,11 @@ SUBSYSTEM_METRICS_TASK_MANAGER_RECORD_INTERVAL = 15
# The maximum allowed jobs to start on a given task manager cycle
START_TASK_LIMIT = 100
# Time out task managers if they take longer than this many seconds, plus TASK_MANAGER_TIMEOUT_GRACE_PERIOD
# We have the grace period so the task manager can bail out before the timeout.
TASK_MANAGER_TIMEOUT = 300
TASK_MANAGER_TIMEOUT_GRACE_PERIOD = 60
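Roughly, the grace period gives a manager room to notice it is over budget and exit cleanly before anything kills it at the hard deadline. An illustrative sketch of how the two values could be combined (assumed helper names, not the scheduler's code):

import time
from django.conf import settings

def soft_deadline_exceeded(start_time):
    # the manager checks this itself and stops starting new work
    return (time.time() - start_time) > settings.TASK_MANAGER_TIMEOUT

def hard_deadline(start_time):
    # anything still running past this point is considered hung
    return start_time + settings.TASK_MANAGER_TIMEOUT + settings.TASK_MANAGER_TIMEOUT_GRACE_PERIOD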
# Disallow sending session cookies over insecure connections
SESSION_COOKIE_SECURE = True
@@ -373,6 +379,7 @@ AUTHENTICATION_BACKENDS = (
'social_core.backends.github_enterprise.GithubEnterpriseOAuth2',
'social_core.backends.github_enterprise.GithubEnterpriseOrganizationOAuth2',
'social_core.backends.github_enterprise.GithubEnterpriseTeamOAuth2',
'social_core.backends.open_id_connect.OpenIdConnectAuth',
'social_core.backends.azuread.AzureADOAuth2',
'awx.sso.backends.SAMLAuth',
'awx.main.backends.AWXModelBackend',
@@ -427,6 +434,10 @@ os.environ.setdefault('DJANGO_LIVE_TEST_SERVER_ADDRESS', 'localhost:9013-9199')
# heartbeat period can factor into some forms of logic, so it is maintained as a setting here
CLUSTER_NODE_HEARTBEAT_PERIOD = 60
# Number of missed heartbeats until a node gets marked as lost
CLUSTER_NODE_MISSED_HEARTBEAT_TOLERANCE = 2
RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD = 60 # https://github.com/ansible/receptor/blob/aa1d589e154d8a0cb99a220aff8f98faf2273be6/pkg/netceptor/netceptor.go#L34
EXECUTION_NODE_REMEDIATION_CHECKS = 60 * 30 # check once every 30 minutes whether an execution node's errors have been resolved
@@ -442,7 +453,8 @@ CELERYBEAT_SCHEDULE = {
'options': {'expires': 50},
},
'gather_analytics': {'task': 'awx.main.tasks.system.gather_analytics', 'schedule': timedelta(minutes=5)},
'task_manager': {'task': 'awx.main.scheduler.tasks.run_task_manager', 'schedule': timedelta(seconds=20), 'options': {'expires': 20}},
'task_manager': {'task': 'awx.main.scheduler.tasks.task_manager', 'schedule': timedelta(seconds=20), 'options': {'expires': 20}},
'dependency_manager': {'task': 'awx.main.scheduler.tasks.dependency_manager', 'schedule': timedelta(seconds=20), 'options': {'expires': 20}},
'k8s_reaper': {'task': 'awx.main.tasks.system.awx_k8s_reaper', 'schedule': timedelta(seconds=60), 'options': {'expires': 50}},
'receptor_reaper': {'task': 'awx.main.tasks.system.awx_receptor_workunit_reaper', 'schedule': timedelta(seconds=60)},
'send_subsystem_metrics': {'task': 'awx.main.analytics.analytics_tasks.send_subsystem_metrics', 'schedule': timedelta(seconds=20)},
@@ -807,86 +819,25 @@ LOGGING = {
'dispatcher': {'format': '%(asctime)s %(levelname)-8s [%(guid)s] %(name)s PID:%(process)d %(message)s'},
'job_lifecycle': {'()': 'awx.main.utils.formatters.JobLifeCycleFormatter'},
},
# Extended below based on install scenario. You probably don't want to add something directly here.
# See 'handler_config' below.
'handlers': {
'console': {
'()': 'logging.StreamHandler',
'level': 'DEBUG',
'filters': ['require_debug_true_or_test', 'dynamic_level_filter', 'guid'],
'filters': ['dynamic_level_filter', 'guid'],
'formatter': 'simple',
},
'null': {'class': 'logging.NullHandler'},
'file': {'class': 'logging.NullHandler', 'formatter': 'simple'},
'syslog': {'level': 'WARNING', 'filters': ['require_debug_false'], 'class': 'logging.NullHandler', 'formatter': 'simple'},
'inventory_import': {'level': 'DEBUG', 'class': 'logging.StreamHandler', 'formatter': 'timed_import'},
'external_logger': {
'class': 'awx.main.utils.handlers.RSysLogHandler',
'formatter': 'json',
'address': '/var/run/awx-rsyslog/rsyslog.sock',
'filters': ['external_log_enabled', 'dynamic_level_filter', 'guid'],
},
'tower_warnings': {
# don't define a level here, it's set by settings.LOG_AGGREGATOR_LEVEL
'class': 'logging.handlers.WatchedFileHandler',
'filters': ['require_debug_false', 'dynamic_level_filter', 'guid'],
'filename': os.path.join(LOG_ROOT, 'tower.log'),
'formatter': 'simple',
},
'callback_receiver': {
# don't define a level here, it's set by settings.LOG_AGGREGATOR_LEVEL
'class': 'logging.handlers.WatchedFileHandler',
'filters': ['require_debug_false', 'dynamic_level_filter', 'guid'],
'filename': os.path.join(LOG_ROOT, 'callback_receiver.log'),
'formatter': 'simple',
},
'dispatcher': {
# don't define a level here, it's set by settings.LOG_AGGREGATOR_LEVEL
'class': 'logging.handlers.WatchedFileHandler',
'filters': ['require_debug_false', 'dynamic_level_filter', 'guid'],
'filename': os.path.join(LOG_ROOT, 'dispatcher.log'),
'formatter': 'dispatcher',
},
'wsbroadcast': {
# don't define a level here, it's set by settings.LOG_AGGREGATOR_LEVEL
'class': 'logging.handlers.WatchedFileHandler',
'filters': ['require_debug_false', 'dynamic_level_filter', 'guid'],
'filename': os.path.join(LOG_ROOT, 'wsbroadcast.log'),
'formatter': 'simple',
},
'celery.beat': {'class': 'logging.StreamHandler', 'level': 'ERROR'}, # don't log every celerybeat wakeup
'inventory_import': {'level': 'DEBUG', 'class': 'logging.StreamHandler', 'formatter': 'timed_import'},
'task_system': {
# don't define a level here, it's set by settings.LOG_AGGREGATOR_LEVEL
'class': 'logging.handlers.WatchedFileHandler',
'filters': ['require_debug_false', 'dynamic_level_filter', 'guid'],
'filename': os.path.join(LOG_ROOT, 'task_system.log'),
'formatter': 'simple',
},
'management_playbooks': {
'level': 'DEBUG',
'class': 'logging.handlers.WatchedFileHandler',
'filters': ['require_debug_false'],
'filename': os.path.join(LOG_ROOT, 'management_playbooks.log'),
'formatter': 'simple',
},
'system_tracking_migrations': {
'level': 'WARNING',
'class': 'logging.handlers.WatchedFileHandler',
'filters': ['require_debug_false'],
'filename': os.path.join(LOG_ROOT, 'tower_system_tracking_migrations.log'),
'formatter': 'simple',
},
'rbac_migrations': {
'level': 'WARNING',
'class': 'logging.handlers.WatchedFileHandler',
'filters': ['require_debug_false'],
'filename': os.path.join(LOG_ROOT, 'tower_rbac_migrations.log'),
'formatter': 'simple',
},
'job_lifecycle': {
'level': 'DEBUG',
'class': 'logging.handlers.WatchedFileHandler',
'filename': os.path.join(LOG_ROOT, 'job_lifecycle.log'),
'formatter': 'job_lifecycle',
},
},
'loggers': {
'django': {'handlers': ['console']},
@@ -919,6 +870,40 @@ LOGGING = {
},
}
# Log handler configuration. Keys are the handler names. Be mindful when renaming things here.
# People might have created custom settings files that augment the behavior of these.
# Specify 'filename' (used if the environment variable AWX_LOGGING_MODE is unset or 'file')
# and an optional 'formatter'. If no formatter is specified, 'simple' is used.
handler_config = {
'tower_warnings': {'filename': 'tower.log'},
'callback_receiver': {'filename': 'callback_receiver.log'},
'dispatcher': {'filename': 'dispatcher.log', 'formatter': 'dispatcher'},
'wsbroadcast': {'filename': 'wsbroadcast.log'},
'task_system': {'filename': 'task_system.log'},
'rbac_migrations': {'filename': 'tower_rbac_migrations.log'},
'job_lifecycle': {'filename': 'job_lifecycle.log', 'formatter': 'job_lifecycle'},
}
# If running on a VM, we log to files. When running in a container, we log to stdout.
logging_mode = os.getenv('AWX_LOGGING_MODE', 'file')
if logging_mode not in ('file', 'stdout'):
raise Exception("AWX_LOGGING_MODE must be 'file' or 'stdout'")
for name, config in handler_config.items():
# Common log handler config. Don't define a level here, it's set by settings.LOG_AGGREGATOR_LEVEL
LOGGING['handlers'][name] = {'filters': ['dynamic_level_filter', 'guid'], 'formatter': config.get('formatter', 'simple')}
if logging_mode == 'file':
LOGGING['handlers'][name]['class'] = 'logging.handlers.WatchedFileHandler'
LOGGING['handlers'][name]['filename'] = os.path.join(LOG_ROOT, config['filename'])
if logging_mode == 'stdout':
LOGGING['handlers'][name]['class'] = 'logging.NullHandler'
# Prevents logging to stdout on traditional VM installs
if logging_mode == 'file':
LOGGING['handlers']['console']['filters'].insert(0, 'require_debug_true_or_test')
# Apply coloring to messages logged to the console
COLOR_LOGS = False
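For illustration, here is roughly what the handler_config loop above produces for the 'dispatcher' handler under each AWX_LOGGING_MODE; the LOG_ROOT value is an assumption for the example:

import os

LOG_ROOT = '/var/log/tower'  # assumed value for this example

# AWX_LOGGING_MODE=file (or unset): a real file handler is configured
file_mode_dispatcher = {
    'filters': ['dynamic_level_filter', 'guid'],
    'formatter': 'dispatcher',
    'class': 'logging.handlers.WatchedFileHandler',
    'filename': os.path.join(LOG_ROOT, 'dispatcher.log'),
}

# AWX_LOGGING_MODE=stdout: the file handler is nulled out and output is
# carried by the 'console' handler, which keeps its permissive filter set
stdout_mode_dispatcher = {
    'filters': ['dynamic_level_filter', 'guid'],
    'formatter': 'dispatcher',
    'class': 'logging.NullHandler',
}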
@@ -1018,3 +1003,17 @@ DEFAULT_CONTAINER_RUN_OPTIONS = ['--network', 'slirp4netns:enable_ipv6=true']
# Mount exposed paths as hostPath resource in k8s/ocp
AWX_MOUNT_ISOLATED_PATHS_ON_K8S = False
# Time out task managers if they take longer than this many seconds
TASK_MANAGER_TIMEOUT = 300
# Number of seconds _in addition to_ the task manager timeout a job can stay
# in waiting without being reaped
JOB_WAITING_GRACE_PERIOD = 60
# Number of seconds to wait after a container group job finishes
# before the awx_k8s_reaper task tears down the pods
K8S_POD_REAPER_GRACE_PERIOD = 60
# This is overridden downstream via /etc/tower/conf.d/cluster_host_id.py
CLUSTER_HOST_ID = socket.gethostname()
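A rough sketch of how these timeout and grace-period settings relate, based only on the comments above (not the reaper implementation itself):

TASK_MANAGER_TIMEOUT = 300
TASK_MANAGER_TIMEOUT_GRACE_PERIOD = 60
JOB_WAITING_GRACE_PERIOD = 60

# A task manager is treated as hung after the timeout plus its grace period
task_manager_reap_after = TASK_MANAGER_TIMEOUT + TASK_MANAGER_TIMEOUT_GRACE_PERIOD  # 360 seconds
# A job stuck in 'waiting' is reaped after the task manager timeout plus the job grace period
waiting_job_reap_after = TASK_MANAGER_TIMEOUT + JOB_WAITING_GRACE_PERIOD  # 360 seconds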

View File

@@ -78,18 +78,6 @@ include(optional('/etc/tower/conf.d/*.py'), scope=locals())
BASE_VENV_PATH = "/var/lib/awx/venv/"
AWX_VENV_PATH = os.path.join(BASE_VENV_PATH, "awx")
# If any local_*.py files are present in awx/settings/, use them to override
# default settings for development. If not present, we can still run using
# only the defaults.
try:
if os.getenv('AWX_KUBE_DEVEL', False):
include(optional('minikube.py'), scope=locals())
else:
include(optional('local_*.py'), scope=locals())
except ImportError:
traceback.print_exc()
sys.exit(1)
# Use SQLite for unit tests instead of PostgreSQL. If the lines below are
# commented out, Django will create the test_awx-dev database in PostgreSQL to
# run unit tests.
@@ -110,5 +98,25 @@ CLUSTER_HOST_ID = socket.gethostname()
AWX_CALLBACK_PROFILE = True
# ======================!!!!!!! FOR DEVELOPMENT ONLY !!!!!!!=================================
# Disable normal scheduled/triggered task managers (DependencyManager, TaskManager, WorkflowManager).
# Allows user to trigger task managers directly for debugging and profiling purposes.
# Only works in combination with settings.SETTINGS_MODULE == 'awx.settings.development'
AWX_DISABLE_TASK_MANAGERS = False
# ======================!!!!!!! FOR DEVELOPMENT ONLY !!!!!!!=================================
if 'sqlite3' not in DATABASES['default']['ENGINE']: # noqa
DATABASES['default'].setdefault('OPTIONS', dict()).setdefault('application_name', f'{CLUSTER_HOST_ID}-{os.getpid()}-{" ".join(sys.argv)}'[:63]) # noqa
# If any local_*.py files are present in awx/settings/, use them to override
# default settings for development. If not present, we can still run using
# only the defaults.
# this needs to stay at the bottom of this file
try:
if os.getenv('AWX_KUBE_DEVEL', False):
include(optional('minikube.py'), scope=locals())
else:
include(optional('local_*.py'), scope=locals())
except ImportError:
traceback.print_exc()
sys.exit(1)
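A side note on the application_name option above: PostgreSQL truncates application_name to 63 characters, which is why the value is pre-truncated with [:63]. A tiny sketch with a placeholder host id:

import os
import sys

CLUSTER_HOST_ID = 'awx-devel'  # placeholder; normally socket.gethostname()
application_name = f'{CLUSTER_HOST_ID}-{os.getpid()}-{" ".join(sys.argv)}'[:63]
assert len(application_name) <= 63  # PostgreSQL would truncate anything longer anyway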

View File

@@ -68,6 +68,7 @@ class LDAPSettings(BaseLDAPSettings):
class LDAPBackend(BaseLDAPBackend):
"""
Custom LDAP backend for AWX.
"""
@@ -116,7 +117,17 @@ class LDAPBackend(BaseLDAPBackend):
for setting_name, type_ in [('GROUP_SEARCH', 'LDAPSearch'), ('GROUP_TYPE', 'LDAPGroupType')]:
if getattr(self.settings, setting_name) is None:
raise ImproperlyConfigured("{} must be an {} instance.".format(setting_name, type_))
return super(LDAPBackend, self).authenticate(request, username, password)
ldap_user = super(LDAPBackend, self).authenticate(request, username, password)
# If we have an LDAP user, and the user we found has an ldap_user internal object with a bound connection,
# then we can try to force an unbind to close the sticky connection
if ldap_user and ldap_user.ldap_user and ldap_user.ldap_user._connection_bound:
logger.debug("Forcing LDAP connection to close")
try:
ldap_user.ldap_user._connection.unbind_s()
ldap_user.ldap_user._connection_bound = False
except Exception:
logger.exception(f"Got unexpected LDAP exception when forcing LDAP disconnect for user {ldap_user}, login will still proceed")
return ldap_user
except Exception:
logger.exception("Encountered an error authenticating to LDAP")
return None

View File

@@ -1215,6 +1215,54 @@ register(
placeholder=SOCIAL_AUTH_TEAM_MAP_PLACEHOLDER,
)
###############################################################################
# Generic OIDC AUTHENTICATION SETTINGS
###############################################################################
register(
'SOCIAL_AUTH_OIDC_KEY',
field_class=fields.CharField,
allow_null=False,
default=None,
label=_('OIDC Key'),
help_text='The OIDC key (Client ID) from your IDP.',
category=_('Generic OIDC'),
category_slug='oidc',
)
register(
'SOCIAL_AUTH_OIDC_SECRET',
field_class=fields.CharField,
allow_blank=True,
default='',
label=_('OIDC Secret'),
help_text=_('The OIDC secret (Client Secret) from your IDP.'),
category=_('Generic OIDC'),
category_slug='oidc',
encrypted=True,
)
register(
'SOCIAL_AUTH_OIDC_OIDC_ENDPOINT',
field_class=fields.CharField,
allow_blank=True,
default='',
label=_('OIDC Provider URL'),
help_text=_('The URL for your OIDC provider including the path up to /.well-known/openid-configuration'),
category=_('Generic OIDC'),
category_slug='oidc',
)
register(
'SOCIAL_AUTH_OIDC_VERIFY_SSL',
field_class=fields.BooleanField,
default=True,
label=_('Verify OIDC Provider Certificate'),
help_text=_('Verify the OIDC provider SSL certificate.'),
category=_('Generic OIDC'),
category_slug='oidc',
)
###############################################################################
# SAML AUTHENTICATION SETTINGS
###############################################################################
@@ -1535,12 +1583,12 @@ register(
category_slug='saml',
placeholder=[
('is_superuser_attr', 'saml_attr'),
('is_superuser_value', 'value'),
('is_superuser_role', 'saml_role'),
('is_superuser_value', ['value']),
('is_superuser_role', ['saml_role']),
('remove_superusers', True),
('is_system_auditor_attr', 'saml_attr'),
('is_system_auditor_value', 'value'),
('is_system_auditor_role', 'saml_role'),
('is_system_auditor_value', ['value']),
('is_system_auditor_role', ['saml_role']),
('remove_system_auditors', True),
],
)
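As a hedged usage sketch (not taken from the AWX documentation), the four Generic OIDC settings registered above could be set like any other AWX setting, assuming the usual /api/v2/settings/<category_slug>/ endpoint with the 'oidc' slug; the host, credentials, and IdP values below are placeholders:

import requests

oidc_settings = {
    'SOCIAL_AUTH_OIDC_KEY': 'my-client-id',
    'SOCIAL_AUTH_OIDC_SECRET': 'my-client-secret',
    'SOCIAL_AUTH_OIDC_OIDC_ENDPOINT': 'https://idp.example.com/realms/demo',
    'SOCIAL_AUTH_OIDC_VERIFY_SSL': True,
}
response = requests.patch(
    'https://awx.example.com/api/v2/settings/oidc/',  # assumed endpoint pattern
    json=oidc_settings,
    auth=('admin', 'password'),  # placeholder credentials
)
response.raise_for_status()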

View File

@@ -149,6 +149,7 @@ class AuthenticationBackendsField(fields.StringListField):
('awx.sso.backends.RADIUSBackend', ['RADIUS_SERVER']),
('social_core.backends.google.GoogleOAuth2', ['SOCIAL_AUTH_GOOGLE_OAUTH2_KEY', 'SOCIAL_AUTH_GOOGLE_OAUTH2_SECRET']),
('social_core.backends.github.GithubOAuth2', ['SOCIAL_AUTH_GITHUB_KEY', 'SOCIAL_AUTH_GITHUB_SECRET']),
('social_core.backends.open_id_connect.OpenIdConnectAuth', ['SOCIAL_AUTH_OIDC_KEY', 'SOCIAL_AUTH_OIDC_SECRET', 'SOCIAL_AUTH_OIDC_OIDC_ENDPOINT']),
(
'social_core.backends.github.GithubOrganizationOAuth2',
['SOCIAL_AUTH_GITHUB_ORG_KEY', 'SOCIAL_AUTH_GITHUB_ORG_SECRET', 'SOCIAL_AUTH_GITHUB_ORG_NAME'],
@@ -741,12 +742,12 @@ class SAMLTeamAttrField(HybridDictField):
class SAMLUserFlagsAttrField(HybridDictField):
is_superuser_attr = fields.CharField(required=False, allow_null=True)
is_superuser_value = fields.CharField(required=False, allow_null=True)
is_superuser_role = fields.CharField(required=False, allow_null=True)
is_superuser_value = fields.StringListField(required=False, allow_null=True)
is_superuser_role = fields.StringListField(required=False, allow_null=True)
remove_superusers = fields.BooleanField(required=False, allow_null=True)
is_system_auditor_attr = fields.CharField(required=False, allow_null=True)
is_system_auditor_value = fields.CharField(required=False, allow_null=True)
is_system_auditor_role = fields.CharField(required=False, allow_null=True)
is_system_auditor_value = fields.StringListField(required=False, allow_null=True)
is_system_auditor_role = fields.StringListField(required=False, allow_null=True)
remove_system_auditors = fields.BooleanField(required=False, allow_null=True)
child = _Forbidden()

View File

@@ -0,0 +1,58 @@
from django.db import migrations, connection
import json
_values_to_change = ['is_superuser_value', 'is_superuser_role', 'is_system_auditor_value', 'is_system_auditor_role']
def _get_setting():
with connection.cursor() as cursor:
cursor.execute(f'SELECT value FROM conf_setting WHERE key= %s', ['SOCIAL_AUTH_SAML_USER_FLAGS_BY_ATTR'])
row = cursor.fetchone()
if row is None:
return {}
existing_setting = row[0]
try:
existing_json = json.loads(existing_setting)
except json.decoder.JSONDecodeError as e:
print("Failed to decode existing json setting:")
print(existing_setting)
raise e
return existing_json
def _set_setting(value):
with connection.cursor() as cursor:
cursor.execute(f'UPDATE conf_setting SET value = %s WHERE key = %s', [json.dumps(value), 'SOCIAL_AUTH_SAML_USER_FLAGS_BY_ATTR'])
def forwards(app, schema_editor):
# The Operation should use schema_editor to apply any changes it
# wants to make to the database.
existing_json = _get_setting()
for key in _values_to_change:
if existing_json.get(key, None) and isinstance(existing_json.get(key), str):
existing_json[key] = [existing_json.get(key)]
_set_setting(existing_json)
def backwards(app, schema_editor):
existing_json = _get_setting()
for key in _values_to_change:
if existing_json.get(key, None) and not isinstance(existing_json.get(key), str):
try:
existing_json[key] = existing_json.get(key).pop()
except IndexError:
existing_json[key] = ""
_set_setting(existing_json)
class Migration(migrations.Migration):
dependencies = [
('sso', '0002_expand_provider_options'),
]
operations = [
migrations.RunPython(forwards, backwards),
]
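A worked example of the coercion this migration performs, operating on a plain dict for illustration rather than on the conf_setting row:

_values_to_change = ['is_superuser_value', 'is_superuser_role', 'is_system_auditor_value', 'is_system_auditor_role']

setting = {
    'is_superuser_attr': 'is_superuser',
    'is_superuser_value': 'value',            # old-style string
    'is_superuser_role': ['already-a-list'],  # already list-valued, left alone
}
for key in _values_to_change:
    if setting.get(key) and isinstance(setting[key], str):
        setting[key] = [setting[key]]
print(setting)
# {'is_superuser_attr': 'is_superuser', 'is_superuser_value': ['value'], 'is_superuser_role': ['already-a-list']}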

View File

@@ -250,7 +250,25 @@ def update_user_teams_by_saml_attr(backend, details, user=None, *args, **kwargs)
[t.member_role.members.remove(user) for t in Team.objects.filter(Q(member_role__members=user) & ~Q(id__in=team_ids))]
def _get_matches(list1, list2):
# Because we are just doing an intersection here we don't really care which list is in which parameter
# A SAML provider could return either a string or a list of items so we need to coerce the SAML value into a list (if needed)
if not isinstance(list1, (list, tuple)):
list1 = [list1]
# In addition, we used to allow strings in the SAML config instead of lists. The migration should take care of that, but just in case, we will convert our list too
if not isinstance(list2, (list, tuple)):
list2 = [list2]
return set(list1).intersection(set(list2))
def _check_flag(user, flag, attributes, user_flags_settings):
'''
Helper function to set the is_superuser and is_system_auditor flags for the SAML adapter
Returns the new flag and whether or not it changed the flag
'''
new_flag = False
is_role_key = "is_%s_role" % (flag)
is_attr_key = "is_%s_attr" % (flag)
@@ -258,37 +276,35 @@ def _check_flag(user, flag, attributes, user_flags_settings):
remove_setting = "remove_%ss" % (flag)
# Check to see if we are respecting a role and, if so, does our user have that role?
role_setting = user_flags_settings.get(is_role_key, None)
if role_setting:
required_roles = user_flags_settings.get(is_role_key, None)
if required_roles:
matching_roles = _get_matches(required_roles, attributes.get('Role', []))
# We do a 2 layer check here so that we don't spit out the else message if there is no role defined
if role_setting in attributes.get('Role', []):
logger.debug("User %s has %s role %s" % (user.username, flag, role_setting))
if matching_roles:
logger.debug("User %s has %s role(s) %s" % (user.username, flag, ', '.join(matching_roles)))
new_flag = True
else:
logger.debug("User %s is missing the %s role %s" % (user.username, flag, role_setting))
logger.debug("User %s is missing the %s role(s) %s" % (user.username, flag, ', '.join(required_roles)))
# Next, check to see if we are respecting an attribute; this will take priority over the role if it's defined
attr_setting = user_flags_settings.get(is_attr_key, None)
if attr_setting and attributes.get(attr_setting, None):
# Do we have a required value for the attribute
if user_flags_settings.get(is_value_key, None):
required_value = user_flags_settings.get(is_value_key, None)
if required_value:
# If so, check and see if the value of the attr matches the required value
attribute_value = attributes.get(attr_setting, None)
attribute_matches = False
if isinstance(attribute_value, (list, tuple)):
if user_flags_settings.get(is_value_key) in attribute_value:
attribute_matches = True
elif attribute_value == user_flags_settings.get(is_value_key):
attribute_matches = True
saml_user_attribute_value = attributes.get(attr_setting, None)
matching_values = _get_matches(required_value, saml_user_attribute_value)
if attribute_matches:
logger.debug("Giving %s %s from attribute %s with matching value" % (user.username, flag, attr_setting))
if matching_values:
logger.debug("Giving %s %s from attribute %s with matching values %s" % (user.username, flag, attr_setting, ', '.join(matching_values)))
new_flag = True
# if they don't match make sure that new_flag is false
else:
logger.debug(
"For %s on %s attr %s (%s) did not match expected value '%s'"
% (flag, user.username, attr_setting, attribute_value, user_flags_settings.get(is_value_key))
"Refusing %s for %s because attr %s (%s) did not match value(s) %s"
% (flag, user.username, attr_setting, ", ".join(saml_user_attribute_value), ', '.join(required_value))
)
new_flag = False
# If there was no required value then we can just allow them in because of the attribute
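A standalone sketch of the _get_matches() helper shown above: coerce either argument to a list, then take the set intersection.

def get_matches(list1, list2):
    # coerce either side to a list, then intersect
    if not isinstance(list1, (list, tuple)):
        list1 = [list1]
    if not isinstance(list2, (list, tuple)):
        list2 = [list2]
    return set(list1).intersection(set(list2))

print(get_matches('admin', ['admin', 'dev']))         # {'admin'}
print(get_matches(['ops', 'dev'], ['dev', 'admin']))  # {'dev'}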

View File

@@ -446,6 +446,10 @@ class TestSAMLUserFlags:
(False, False),
False,
),
# NOTE: The first handful of tests use role/value as strings instead of lists.
# This was the initial implementation of these fields, but the code should still be able to handle it.
# There are a couple of tests at the end which validate lists in these values.
#
# In this case we will give the user a group to make them an admin
(
{'is_superuser_role': 'test-role-1'},
@@ -518,6 +522,30 @@ class TestSAMLUserFlags:
(True, False),
True,
),
# Positive test for multiple values for is_superuser_value
(
{'is_superuser_attr': 'is_superuser', 'is_superuser_value': ['junk', 'junk2', 'else', 'junk']},
(True, True),
False,
),
# Negative test for multiple values for is_superuser_value
(
{'is_superuser_attr': 'is_superuser', 'is_superuser_value': ['junk', 'junk2', 'junk']},
(False, True),
True,
),
# Positive test for multiple values of is_superuser_role
(
{'is_superuser_role': ['junk', 'junk2', 'something', 'junk']},
(True, True),
False,
),
# Negative test for multiple values of is_superuser_role
(
{'is_superuser_role': ['junk', 'junk2', 'junk']},
(False, True),
True,
),
],
)
def test__check_flag(self, user_flags_settings, expected, is_superuser):

View File

@@ -121,12 +121,12 @@ class TestSAMLUserFlagsAttrField:
[
{},
{'is_superuser_attr': 'something'},
{'is_superuser_value': 'value'},
{'is_superuser_role': 'my_peeps'},
{'is_superuser_value': ['value']},
{'is_superuser_role': ['my_peeps']},
{'remove_superusers': False},
{'is_system_auditor_attr': 'something_else'},
{'is_system_auditor_value': 'value2'},
{'is_system_auditor_role': 'other_peeps'},
{'is_system_auditor_value': ['value2']},
{'is_system_auditor_role': ['other_peeps']},
{'remove_system_auditors': False},
],
)
@@ -147,7 +147,13 @@ class TestSAMLUserFlagsAttrField:
'is_system_auditor_value': 'value2',
'is_system_auditor_role': 'other_peeps',
},
{'junk': ['Invalid field.']},
{
'junk': ['Invalid field.'],
'is_superuser_role': ['Expected a list of items but got type "str".'],
'is_superuser_value': ['Expected a list of items but got type "str".'],
'is_system_auditor_role': ['Expected a list of items but got type "str".'],
'is_system_auditor_value': ['Expected a list of items but got type "str".'],
},
),
(
{

View File

@@ -44,7 +44,7 @@ Have questions about this document or anything not covered here? Feel free to re
- functions should adopt camelCase
- constructors/classes should adopt PascalCase
- constants to be exported should adopt UPPERCASE
- For strings, we adopt `sentence capitalization`, per the [Patternfly style guide](https://www.patternfly.org/v4/design-guidelines/content/grammar-and-terminology#capitalization).
- For strings, we adopt `sentence capitalization`, per the [Patternfly style guide](https://www.patternfly.org/v4/ux-writing/capitalization).
## Setting up your development environment

awx/ui/package-lock.json (generated, 238 lines changed)
View File

@@ -7,30 +7,30 @@
"name": "ui",
"dependencies": {
"@lingui/react": "3.14.0",
"@patternfly/patternfly": "4.202.1",
"@patternfly/patternfly": "4.210.2",
"@patternfly/react-core": "^4.221.3",
"@patternfly/react-icons": "4.75.1",
"@patternfly/react-table": "4.93.1",
"ace-builds": "^1.6.0",
"@patternfly/react-table": "4.100.8",
"ace-builds": "^1.10.1",
"ansi-to-html": "0.7.2",
"axios": "0.27.2",
"codemirror": "^6.0.1",
"d3": "7.4.4",
"dagre": "^0.8.4",
"dompurify": "2.3.8",
"dompurify": "2.4.0",
"formik": "2.2.9",
"has-ansi": "5.0.1",
"html-entities": "2.3.2",
"js-yaml": "4.1.0",
"luxon": "^2.4.0",
"prop-types": "^15.6.2",
"luxon": "^3.0.3",
"prop-types": "^15.8.1",
"react": "17.0.2",
"react-ace": "^10.1.0",
"react-dom": "17.0.2",
"react-error-boundary": "^3.1.4",
"react-router-dom": "^5.3.3",
"react-virtualized": "^9.21.1",
"rrule": "2.7.0",
"rrule": "2.7.1",
"styled-components": "5.3.5"
},
"devDependencies": {
@@ -3746,18 +3746,18 @@
"dev": true
},
"node_modules/@patternfly/patternfly": {
"version": "4.202.1",
"resolved": "https://registry.npmjs.org/@patternfly/patternfly/-/patternfly-4.202.1.tgz",
"integrity": "sha512-cQiiPqmwJOm9onuTfLPQNRlpAZwDIJ/zVfDQeaFqMQyPJtxtKn3lkphz5xErY5dPs9rR4X94ytQ1I9pkVzaPJQ=="
"version": "4.210.2",
"resolved": "https://registry.npmjs.org/@patternfly/patternfly/-/patternfly-4.210.2.tgz",
"integrity": "sha512-aZiW24Bxi6uVmk5RyNTp+6q6ThtlJZotNRJfWVeGuwu1UlbBuV4DFa1bpjA6jfTZpfEpX2YL5+R+4ZVSCFAVdw=="
},
"node_modules/@patternfly/react-core": {
"version": "4.224.1",
"resolved": "https://registry.npmjs.org/@patternfly/react-core/-/react-core-4.224.1.tgz",
"integrity": "sha512-v8wGGNoMGndAScAoE5jeOA5jVgymlLSwttPjQk/Idr0k7roSpOrsM39oXUR5DEgkZee45DW00WKTgmg50PP3FQ==",
"version": "4.231.8",
"resolved": "https://registry.npmjs.org/@patternfly/react-core/-/react-core-4.231.8.tgz",
"integrity": "sha512-2ClqlYCvSADppMfVfkUGIA/8XlO6jX8batoClXLxZDwqGoOfr61XyUgQ6SSlE4w60czoNeX4Nf6cfQKUH4RIKw==",
"dependencies": {
"@patternfly/react-icons": "4.75.1",
"@patternfly/react-styles": "^4.74.1",
"@patternfly/react-tokens": "^4.76.1",
"@patternfly/react-icons": "^4.82.8",
"@patternfly/react-styles": "^4.81.8",
"@patternfly/react-tokens": "^4.83.8",
"focus-trap": "6.9.2",
"react-dropzone": "9.0.0",
"tippy.js": "5.1.2",
@@ -3768,6 +3768,15 @@
"react-dom": "^16.8.0 || ^17.0.0"
}
},
"node_modules/@patternfly/react-core/node_modules/@patternfly/react-icons": {
"version": "4.82.8",
"resolved": "https://registry.npmjs.org/@patternfly/react-icons/-/react-icons-4.82.8.tgz",
"integrity": "sha512-cKixprTiMLZRe/+kmdZ5suvYb9ly9p1f/HjlcNiWBfsiA8ZDEPmxJnVdend/YsafelC8YC9QGcQf97ay5PNhcw==",
"peerDependencies": {
"react": "^16.8.0 || ^17.0.0",
"react-dom": "^16.8.0 || ^17.0.0"
}
},
"node_modules/@patternfly/react-core/node_modules/tslib": {
"version": "2.3.1",
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.3.1.tgz",
@@ -3783,19 +3792,19 @@
}
},
"node_modules/@patternfly/react-styles": {
"version": "4.74.1",
"resolved": "https://registry.npmjs.org/@patternfly/react-styles/-/react-styles-4.74.1.tgz",
"integrity": "sha512-9eWvKrjtrJ3qhJkhY2GQKyYA13u/J0mU1befH49SYbvxZtkbuHdpKmXBAeQoHmcx1hcOKtiYXeKb+dVoRRNx0A=="
"version": "4.81.8",
"resolved": "https://registry.npmjs.org/@patternfly/react-styles/-/react-styles-4.81.8.tgz",
"integrity": "sha512-Q5FiureSSCMIuz+KLMcEm1317TzbXcwmg2q5iNDRKyf/K+5CT6tJp0Wbtk3FlfRvzli4u/7YfXipahia5TL+tA=="
},
"node_modules/@patternfly/react-table": {
"version": "4.93.1",
"resolved": "https://registry.npmjs.org/@patternfly/react-table/-/react-table-4.93.1.tgz",
"integrity": "sha512-N/zHkNsY3X3yUXPg6COwdZKAFmTCbWm25qCY2aHjrXlIlE2OKWaYvVag0CcTwPiQhIuCumztr9Y2Uw9uvv0Fsw==",
"version": "4.100.8",
"resolved": "https://registry.npmjs.org/@patternfly/react-table/-/react-table-4.100.8.tgz",
"integrity": "sha512-80XZCZzoYN9gsoufNdXUB/dk33SuWF9lUnOJs7ilezD6noTSD7ARqO1h532eaEPIbPBp4uIVkEUdfGSHd0HJtg==",
"dependencies": {
"@patternfly/react-core": "^4.224.1",
"@patternfly/react-icons": "4.75.1",
"@patternfly/react-styles": "^4.74.1",
"@patternfly/react-tokens": "^4.76.1",
"@patternfly/react-core": "^4.231.8",
"@patternfly/react-icons": "^4.82.8",
"@patternfly/react-styles": "^4.81.8",
"@patternfly/react-tokens": "^4.83.8",
"lodash": "^4.17.19",
"tslib": "^2.0.0"
},
@@ -3804,15 +3813,24 @@
"react-dom": "^16.8.0 || ^17.0.0"
}
},
"node_modules/@patternfly/react-table/node_modules/@patternfly/react-icons": {
"version": "4.82.8",
"resolved": "https://registry.npmjs.org/@patternfly/react-icons/-/react-icons-4.82.8.tgz",
"integrity": "sha512-cKixprTiMLZRe/+kmdZ5suvYb9ly9p1f/HjlcNiWBfsiA8ZDEPmxJnVdend/YsafelC8YC9QGcQf97ay5PNhcw==",
"peerDependencies": {
"react": "^16.8.0 || ^17.0.0",
"react-dom": "^16.8.0 || ^17.0.0"
}
},
"node_modules/@patternfly/react-table/node_modules/tslib": {
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.4.0.tgz",
"integrity": "sha512-d6xOpEDfsi2CZVlPQzGeux8XMwLT9hssAsaPYExaQMuYskwb+x1x7J371tWlbBdWHroy99KnVB6qIkUbs5X3UQ=="
},
"node_modules/@patternfly/react-tokens": {
"version": "4.76.1",
"resolved": "https://registry.npmjs.org/@patternfly/react-tokens/-/react-tokens-4.76.1.tgz",
"integrity": "sha512-gLEezRSzQeflaPu3SCgYmWtuiqDIRtxNNFP1+ES7P2o56YHXJ5o1Pki7LpNCPk/VOzHy2+vRFE/7l+hBEweugw=="
"version": "4.83.8",
"resolved": "https://registry.npmjs.org/@patternfly/react-tokens/-/react-tokens-4.83.8.tgz",
"integrity": "sha512-Z/MHXNY8PQOuBFGUar2yzPVbz3BNJuhB+Dnk5RJcc/iIn3S+VlSru7g6v5jqoV/+a5wLqZtLGEBp8uhCZ7Xkig=="
},
"node_modules/@pmmmwh/react-refresh-webpack-plugin": {
"version": "0.5.4",
@@ -5249,9 +5267,9 @@
}
},
"node_modules/ace-builds": {
"version": "1.6.0",
"resolved": "https://registry.npmjs.org/ace-builds/-/ace-builds-1.6.0.tgz",
"integrity": "sha512-qdkx965G/TA12IK7Zk+iCVDtA9wvhxIGivGc2rsID4UYbY2Bpatwep3ZrBZwj1IB2miU6FodDMqM9Kc1lqDlLg=="
"version": "1.10.1",
"resolved": "https://registry.npmjs.org/ace-builds/-/ace-builds-1.10.1.tgz",
"integrity": "sha512-w8Xj6lZUtOYAquVYvdpZhb0GxXrZ+qpVfgj5LP2FwUbXE8fPrCmfu86FjwOiSphx/8PMbXXVldFLD2+RIXayyA=="
},
"node_modules/acorn": {
"version": "7.4.1",
@@ -6448,14 +6466,20 @@
}
},
"node_modules/caniuse-lite": {
"version": "1.0.30001300",
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001300.tgz",
"integrity": "sha512-cVjiJHWGcNlJi8TZVKNMnvMid3Z3TTdDHmLDzlOdIiZq138Exvo0G+G0wTdVYolxKb4AYwC+38pxodiInVtJSA==",
"version": "1.0.30001393",
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001393.tgz",
"integrity": "sha512-N/od11RX+Gsk+1qY/jbPa0R6zJupEa0lxeBG598EbrtblxVCTJsQwbRBm6+V+rxpc5lHKdsXb9RY83cZIPLseA==",
"dev": true,
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/browserslist"
}
"funding": [
{
"type": "opencollective",
"url": "https://opencollective.com/browserslist"
},
{
"type": "tidelift",
"url": "https://tidelift.com/funding/github/npm/caniuse-lite"
}
]
},
"node_modules/case-sensitive-paths-webpack-plugin": {
"version": "2.4.0",
@@ -8271,9 +8295,9 @@
}
},
"node_modules/dompurify": {
"version": "2.3.8",
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-2.3.8.tgz",
"integrity": "sha512-eVhaWoVibIzqdGYjwsBWodIQIaXFSB+cKDf4cfxLMsK0xiud6SE+/WCVx/Xw/UwQsa4cS3T2eITcdtmTg2UKcw=="
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-2.4.0.tgz",
"integrity": "sha512-Be9tbQMZds4a3C6xTmz68NlMfeONA//4dOavl/1rNw50E+/QO0KVpbcU0PcaW0nsQxurXls9ZocqFxk8R2mWEA=="
},
"node_modules/domutils": {
"version": "2.8.0",
@@ -15448,9 +15472,9 @@
}
},
"node_modules/luxon": {
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/luxon/-/luxon-2.4.0.tgz",
"integrity": "sha512-w+NAwWOUL5hO0SgwOHsMBAmZ15SoknmQXhSO0hIbJCAmPKSsGeK8MlmhYh2w6Iib38IxN2M+/ooXWLbeis7GuA==",
"version": "3.0.3",
"resolved": "https://registry.npmjs.org/luxon/-/luxon-3.0.3.tgz",
"integrity": "sha512-+EfHWnF+UT7GgTnq5zXg3ldnTKL2zdv7QJgsU5bjjpbH17E3qi/puMhQyJVYuCq+FRkogvB5WB6iVvUr+E4a7w==",
"engines": {
"node": ">=12"
}
@@ -17829,13 +17853,13 @@
}
},
"node_modules/prop-types": {
"version": "15.7.2",
"resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.7.2.tgz",
"integrity": "sha512-8QQikdH7//R2vurIJSutZ1smHYTcLpRWEOlHnzcWHmBYrOGUysKwSsrC89BCiFj3CbrfJ/nXFdJepOVrY1GCHQ==",
"version": "15.8.1",
"resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.8.1.tgz",
"integrity": "sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==",
"dependencies": {
"loose-envify": "^1.4.0",
"object-assign": "^4.1.1",
"react-is": "^16.8.1"
"react-is": "^16.13.1"
}
},
"node_modules/prop-types-exact": {
@@ -19414,13 +19438,18 @@
}
},
"node_modules/rrule": {
"version": "2.7.0",
"resolved": "https://registry.npmjs.org/rrule/-/rrule-2.7.0.tgz",
"integrity": "sha512-PnSvdJLHrETO4qQxm9nlDvSxNfbPdDFbgdz2BSHXTP+IzHbdwSNvTHOeN0O9khiy91GjzWXyiVJhnPDOQvejNg==",
"version": "2.7.1",
"resolved": "https://registry.npmjs.org/rrule/-/rrule-2.7.1.tgz",
"integrity": "sha512-4p20u/1U7WqR3Nb1hOUrm0u1nSI7sO93ZUVZEZ5HeF6Gr5OlJuyhwEGRvUHq8ZfrPsq5gfa5b9dqnUs/kPqpIw==",
"dependencies": {
"tslib": "^1.10.0"
"tslib": "^2.4.0"
}
},
"node_modules/rrule/node_modules/tslib": {
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.4.0.tgz",
"integrity": "sha512-d6xOpEDfsi2CZVlPQzGeux8XMwLT9hssAsaPYExaQMuYskwb+x1x7J371tWlbBdWHroy99KnVB6qIkUbs5X3UQ=="
},
"node_modules/rst-selector-parser": {
"version": "2.2.3",
"resolved": "https://registry.npmjs.org/rst-selector-parser/-/rst-selector-parser-2.2.3.tgz",
@@ -25064,24 +25093,30 @@
"dev": true
},
"@patternfly/patternfly": {
"version": "4.202.1",
"resolved": "https://registry.npmjs.org/@patternfly/patternfly/-/patternfly-4.202.1.tgz",
"integrity": "sha512-cQiiPqmwJOm9onuTfLPQNRlpAZwDIJ/zVfDQeaFqMQyPJtxtKn3lkphz5xErY5dPs9rR4X94ytQ1I9pkVzaPJQ=="
"version": "4.210.2",
"resolved": "https://registry.npmjs.org/@patternfly/patternfly/-/patternfly-4.210.2.tgz",
"integrity": "sha512-aZiW24Bxi6uVmk5RyNTp+6q6ThtlJZotNRJfWVeGuwu1UlbBuV4DFa1bpjA6jfTZpfEpX2YL5+R+4ZVSCFAVdw=="
},
"@patternfly/react-core": {
"version": "4.224.1",
"resolved": "https://registry.npmjs.org/@patternfly/react-core/-/react-core-4.224.1.tgz",
"integrity": "sha512-v8wGGNoMGndAScAoE5jeOA5jVgymlLSwttPjQk/Idr0k7roSpOrsM39oXUR5DEgkZee45DW00WKTgmg50PP3FQ==",
"version": "4.231.8",
"resolved": "https://registry.npmjs.org/@patternfly/react-core/-/react-core-4.231.8.tgz",
"integrity": "sha512-2ClqlYCvSADppMfVfkUGIA/8XlO6jX8batoClXLxZDwqGoOfr61XyUgQ6SSlE4w60czoNeX4Nf6cfQKUH4RIKw==",
"requires": {
"@patternfly/react-icons": "4.75.1",
"@patternfly/react-styles": "^4.74.1",
"@patternfly/react-tokens": "^4.76.1",
"@patternfly/react-icons": "^4.82.8",
"@patternfly/react-styles": "^4.81.8",
"@patternfly/react-tokens": "^4.83.8",
"focus-trap": "6.9.2",
"react-dropzone": "9.0.0",
"tippy.js": "5.1.2",
"tslib": "^2.0.0"
},
"dependencies": {
"@patternfly/react-icons": {
"version": "4.82.8",
"resolved": "https://registry.npmjs.org/@patternfly/react-icons/-/react-icons-4.82.8.tgz",
"integrity": "sha512-cKixprTiMLZRe/+kmdZ5suvYb9ly9p1f/HjlcNiWBfsiA8ZDEPmxJnVdend/YsafelC8YC9QGcQf97ay5PNhcw==",
"requires": {}
},
"tslib": {
"version": "2.3.1",
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.3.1.tgz",
@@ -25096,23 +25131,29 @@
"requires": {}
},
"@patternfly/react-styles": {
"version": "4.74.1",
"resolved": "https://registry.npmjs.org/@patternfly/react-styles/-/react-styles-4.74.1.tgz",
"integrity": "sha512-9eWvKrjtrJ3qhJkhY2GQKyYA13u/J0mU1befH49SYbvxZtkbuHdpKmXBAeQoHmcx1hcOKtiYXeKb+dVoRRNx0A=="
"version": "4.81.8",
"resolved": "https://registry.npmjs.org/@patternfly/react-styles/-/react-styles-4.81.8.tgz",
"integrity": "sha512-Q5FiureSSCMIuz+KLMcEm1317TzbXcwmg2q5iNDRKyf/K+5CT6tJp0Wbtk3FlfRvzli4u/7YfXipahia5TL+tA=="
},
"@patternfly/react-table": {
"version": "4.93.1",
"resolved": "https://registry.npmjs.org/@patternfly/react-table/-/react-table-4.93.1.tgz",
"integrity": "sha512-N/zHkNsY3X3yUXPg6COwdZKAFmTCbWm25qCY2aHjrXlIlE2OKWaYvVag0CcTwPiQhIuCumztr9Y2Uw9uvv0Fsw==",
"version": "4.100.8",
"resolved": "https://registry.npmjs.org/@patternfly/react-table/-/react-table-4.100.8.tgz",
"integrity": "sha512-80XZCZzoYN9gsoufNdXUB/dk33SuWF9lUnOJs7ilezD6noTSD7ARqO1h532eaEPIbPBp4uIVkEUdfGSHd0HJtg==",
"requires": {
"@patternfly/react-core": "^4.224.1",
"@patternfly/react-icons": "4.75.1",
"@patternfly/react-styles": "^4.74.1",
"@patternfly/react-tokens": "^4.76.1",
"@patternfly/react-core": "^4.231.8",
"@patternfly/react-icons": "^4.82.8",
"@patternfly/react-styles": "^4.81.8",
"@patternfly/react-tokens": "^4.83.8",
"lodash": "^4.17.19",
"tslib": "^2.0.0"
},
"dependencies": {
"@patternfly/react-icons": {
"version": "4.82.8",
"resolved": "https://registry.npmjs.org/@patternfly/react-icons/-/react-icons-4.82.8.tgz",
"integrity": "sha512-cKixprTiMLZRe/+kmdZ5suvYb9ly9p1f/HjlcNiWBfsiA8ZDEPmxJnVdend/YsafelC8YC9QGcQf97ay5PNhcw==",
"requires": {}
},
"tslib": {
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.4.0.tgz",
@@ -25121,9 +25162,9 @@
}
},
"@patternfly/react-tokens": {
"version": "4.76.1",
"resolved": "https://registry.npmjs.org/@patternfly/react-tokens/-/react-tokens-4.76.1.tgz",
"integrity": "sha512-gLEezRSzQeflaPu3SCgYmWtuiqDIRtxNNFP1+ES7P2o56YHXJ5o1Pki7LpNCPk/VOzHy2+vRFE/7l+hBEweugw=="
"version": "4.83.8",
"resolved": "https://registry.npmjs.org/@patternfly/react-tokens/-/react-tokens-4.83.8.tgz",
"integrity": "sha512-Z/MHXNY8PQOuBFGUar2yzPVbz3BNJuhB+Dnk5RJcc/iIn3S+VlSru7g6v5jqoV/+a5wLqZtLGEBp8uhCZ7Xkig=="
},
"@pmmmwh/react-refresh-webpack-plugin": {
"version": "0.5.4",
@@ -26302,9 +26343,9 @@
}
},
"ace-builds": {
"version": "1.6.0",
"resolved": "https://registry.npmjs.org/ace-builds/-/ace-builds-1.6.0.tgz",
"integrity": "sha512-qdkx965G/TA12IK7Zk+iCVDtA9wvhxIGivGc2rsID4UYbY2Bpatwep3ZrBZwj1IB2miU6FodDMqM9Kc1lqDlLg=="
"version": "1.10.1",
"resolved": "https://registry.npmjs.org/ace-builds/-/ace-builds-1.10.1.tgz",
"integrity": "sha512-w8Xj6lZUtOYAquVYvdpZhb0GxXrZ+qpVfgj5LP2FwUbXE8fPrCmfu86FjwOiSphx/8PMbXXVldFLD2+RIXayyA=="
},
"acorn": {
"version": "7.4.1",
@@ -27259,9 +27300,9 @@
}
},
"caniuse-lite": {
"version": "1.0.30001300",
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001300.tgz",
"integrity": "sha512-cVjiJHWGcNlJi8TZVKNMnvMid3Z3TTdDHmLDzlOdIiZq138Exvo0G+G0wTdVYolxKb4AYwC+38pxodiInVtJSA==",
"version": "1.0.30001393",
"resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001393.tgz",
"integrity": "sha512-N/od11RX+Gsk+1qY/jbPa0R6zJupEa0lxeBG598EbrtblxVCTJsQwbRBm6+V+rxpc5lHKdsXb9RY83cZIPLseA==",
"dev": true
},
"case-sensitive-paths-webpack-plugin": {
@@ -28656,9 +28697,9 @@
}
},
"dompurify": {
"version": "2.3.8",
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-2.3.8.tgz",
"integrity": "sha512-eVhaWoVibIzqdGYjwsBWodIQIaXFSB+cKDf4cfxLMsK0xiud6SE+/WCVx/Xw/UwQsa4cS3T2eITcdtmTg2UKcw=="
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-2.4.0.tgz",
"integrity": "sha512-Be9tbQMZds4a3C6xTmz68NlMfeONA//4dOavl/1rNw50E+/QO0KVpbcU0PcaW0nsQxurXls9ZocqFxk8R2mWEA=="
},
"domutils": {
"version": "2.8.0",
@@ -34178,9 +34219,9 @@
}
},
"luxon": {
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/luxon/-/luxon-2.4.0.tgz",
"integrity": "sha512-w+NAwWOUL5hO0SgwOHsMBAmZ15SoknmQXhSO0hIbJCAmPKSsGeK8MlmhYh2w6Iib38IxN2M+/ooXWLbeis7GuA=="
"version": "3.0.3",
"resolved": "https://registry.npmjs.org/luxon/-/luxon-3.0.3.tgz",
"integrity": "sha512-+EfHWnF+UT7GgTnq5zXg3ldnTKL2zdv7QJgsU5bjjpbH17E3qi/puMhQyJVYuCq+FRkogvB5WB6iVvUr+E4a7w=="
},
"lz-string": {
"version": "1.4.4",
@@ -35880,13 +35921,13 @@
}
},
"prop-types": {
"version": "15.7.2",
"resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.7.2.tgz",
"integrity": "sha512-8QQikdH7//R2vurIJSutZ1smHYTcLpRWEOlHnzcWHmBYrOGUysKwSsrC89BCiFj3CbrfJ/nXFdJepOVrY1GCHQ==",
"version": "15.8.1",
"resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.8.1.tgz",
"integrity": "sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==",
"requires": {
"loose-envify": "^1.4.0",
"object-assign": "^4.1.1",
"react-is": "^16.8.1"
"react-is": "^16.13.1"
}
},
"prop-types-exact": {
@@ -37007,11 +37048,18 @@
}
},
"rrule": {
"version": "2.7.0",
"resolved": "https://registry.npmjs.org/rrule/-/rrule-2.7.0.tgz",
"integrity": "sha512-PnSvdJLHrETO4qQxm9nlDvSxNfbPdDFbgdz2BSHXTP+IzHbdwSNvTHOeN0O9khiy91GjzWXyiVJhnPDOQvejNg==",
"version": "2.7.1",
"resolved": "https://registry.npmjs.org/rrule/-/rrule-2.7.1.tgz",
"integrity": "sha512-4p20u/1U7WqR3Nb1hOUrm0u1nSI7sO93ZUVZEZ5HeF6Gr5OlJuyhwEGRvUHq8ZfrPsq5gfa5b9dqnUs/kPqpIw==",
"requires": {
"tslib": "^1.10.0"
"tslib": "^2.4.0"
},
"dependencies": {
"tslib": {
"version": "2.4.0",
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.4.0.tgz",
"integrity": "sha512-d6xOpEDfsi2CZVlPQzGeux8XMwLT9hssAsaPYExaQMuYskwb+x1x7J371tWlbBdWHroy99KnVB6qIkUbs5X3UQ=="
}
}
},
"rst-selector-parser": {

View File

@@ -7,30 +7,30 @@
},
"dependencies": {
"@lingui/react": "3.14.0",
"@patternfly/patternfly": "4.202.1",
"@patternfly/patternfly": "4.210.2",
"@patternfly/react-core": "^4.221.3",
"@patternfly/react-icons": "4.75.1",
"@patternfly/react-table": "4.93.1",
"ace-builds": "^1.6.0",
"@patternfly/react-table": "4.100.8",
"ace-builds": "^1.10.1",
"ansi-to-html": "0.7.2",
"axios": "0.27.2",
"codemirror": "^6.0.1",
"d3": "7.4.4",
"dagre": "^0.8.4",
"dompurify": "2.3.8",
"dompurify": "2.4.0",
"formik": "2.2.9",
"has-ansi": "5.0.1",
"html-entities": "2.3.2",
"js-yaml": "4.1.0",
"luxon": "^2.4.0",
"prop-types": "^15.6.2",
"luxon": "^3.0.3",
"prop-types": "^15.8.1",
"react": "17.0.2",
"react-ace": "^10.1.0",
"react-dom": "17.0.2",
"react-error-boundary": "^3.1.4",
"react-router-dom": "^5.3.3",
"react-virtualized": "^9.21.1",
"rrule": "2.7.0",
"rrule": "2.7.1",
"styled-components": "5.3.5"
},
"devDependencies": {

View File

@@ -7,7 +7,15 @@ class CredentialTypes extends Base {
}
async loadAllTypes(
acceptableKinds = ['machine', 'cloud', 'net', 'ssh', 'vault', 'kubernetes']
acceptableKinds = [
'machine',
'cloud',
'net',
'ssh',
'vault',
'kubernetes',
'cryptography',
]
) {
const pageSize = 200;
// The number of credential types a user can have is unlimited. In practice, it is unlikely for

View File

@@ -2,8 +2,6 @@ import React from 'react';
import { t } from '@lingui/macro';
import { withFormik, useFormikContext } from 'formik';
import PropTypes from 'prop-types';
import { VERBOSITY } from 'components/VerbositySelectField';
import Wizard from '../Wizard';
import useAdHocLaunchSteps from './useAdHocLaunchSteps';
@@ -62,7 +60,7 @@ const FormikApp = withFormik({
limit: adHocItemStrings || 'all',
credentials: [],
module_args: '',
verbosity: VERBOSITY()[0],
verbosity: 0,
forks: 0,
diff_mode: false,
become_enabled: '',

View File

@@ -20,12 +20,7 @@ function NavExpandableGroup(props) {
if (routes.length === 1 && groupId === 'settings') {
const [{ path }] = routes;
return (
<NavItem
itemId={groupId}
isActive={isActivePath(path)}
key={path}
// ouiaId={path}
>
<NavItem itemId={groupId} isActive={isActivePath(path)} key={path}>
<Link to={path}>{groupTitle}</Link>
</NavItem>
);
@@ -40,12 +35,7 @@ function NavExpandableGroup(props) {
title={groupTitle}
>
{routes.map(({ path, title }) => (
<NavItem
groupId={groupId}
isActive={isActivePath(path)}
key={path}
// ouiaId={path}
>
<NavItem groupId={groupId} isActive={isActivePath(path)} key={path}>
<Link to={path}>{title}</Link>
</NavItem>
))}

View File

@@ -9,6 +9,8 @@ function CredentialChip({ credential, ...props }) {
let type;
if (credential.cloud) {
type = t`Cloud`;
} else if (credential.kind === 'gpg_public_key') {
type = t`GPG Public Key`;
} else if (credential.kind === 'aws' || credential.kind === 'ssh') {
type = credential.kind.toUpperCase();
} else {

View File

@@ -29,4 +29,8 @@ export default styled(DetailList)`
--column-count: 3;
}
`}
& + & {
margin-top: 20px;
}
`;

View File

@@ -24,6 +24,7 @@ const CardBody = styled(PFCardBody)`
const Expandable = styled(PFExpandable)`
text-align: left;
max-width: 75vw;
& .pf-c-expandable__toggle {
padding-left: 10px;
@@ -54,7 +55,7 @@ function ErrorDetail({ error }) {
{response?.config?.method.toUpperCase()} {response?.config?.url}{' '}
<strong>{response?.status}</strong>
</CardBody>
<CardBody>
<CardBody css="max-width: 70vw">
{Array.isArray(message) ? (
<ul>
{message.map((m) =>
@@ -70,9 +71,16 @@ function ErrorDetail({ error }) {
};
const renderStack = () => (
<CardBody css="white-space: pre; font-family: var(--pf-global--FontFamily--monospace)">
{error.stack}
</CardBody>
<>
<CardBody>
<strong>
{error.name}: {error.message}
</strong>
</CardBody>
<CardBody css="white-space: pre; font-family: var(--pf-global--FontFamily--monospace)">
{error.stack}
</CardBody>
</>
);
return (

Some files were not shown because too many files have changed in this diff.