Merge pull request #11955 from shanemcd/fail-better

Increase resiliency when application crashes
This commit is contained in:
Shane McDonald
2022-03-30 08:58:26 -04:00
committed by GitHub
11 changed files with 110 additions and 108 deletions

View File

@@ -35,9 +35,11 @@ def reap(instance=None, status='failed', excluded_uuids=[]):
""" """
me = instance me = instance
if me is None: if me is None:
(changed, me) = Instance.objects.get_or_register() try:
if changed: me = Instance.objects.me()
logger.info("Registered node '{}'".format(me.hostname)) except RuntimeError as e:
logger.warning(f'Local instance is not registered, not running reaper: {e}')
return
now = tz_now() now = tz_now()
workflow_ctype_id = ContentType.objects.get_for_model(WorkflowJob).id workflow_ctype_id = ContentType.objects.get_for_model(WorkflowJob).id
jobs = UnifiedJob.objects.filter( jobs = UnifiedJob.objects.filter(

View File

@@ -3,6 +3,7 @@
from django.core.management.base import BaseCommand, CommandError from django.core.management.base import BaseCommand, CommandError
from django.db import transaction from django.db import transaction
from django.conf import settings
from awx.main.models import Instance from awx.main.models import Instance
@@ -13,7 +14,7 @@ class Command(BaseCommand):
Register this instance with the database for HA tracking. Register this instance with the database for HA tracking.
""" """
help = "Add instance to the database. Specify `--hostname` to use this command." help = "Add instance to the database. When no options are provided, the hostname of the current system will be used. Override with `--hostname`."
def add_arguments(self, parser): def add_arguments(self, parser):
parser.add_argument('--hostname', dest='hostname', type=str, help="Hostname used during provisioning") parser.add_argument('--hostname', dest='hostname', type=str, help="Hostname used during provisioning")
@@ -22,8 +23,11 @@ class Command(BaseCommand):
def _register_hostname(self, hostname, node_type, uuid): def _register_hostname(self, hostname, node_type, uuid):
if not hostname: if not hostname:
return if not settings.AWX_AUTO_DEPROVISION_INSTANCES:
(changed, instance) = Instance.objects.register(hostname=hostname, node_type=node_type, uuid=uuid) raise CommandError('Registering with values from settings only intended for use in K8s installs')
(changed, instance) = Instance.objects.get_or_register()
else:
(changed, instance) = Instance.objects.register(hostname=hostname, node_type=node_type, uuid=uuid)
if changed: if changed:
print("Successfully registered instance {}".format(hostname)) print("Successfully registered instance {}".format(hostname))
else: else:
@@ -32,8 +36,6 @@ class Command(BaseCommand):
@transaction.atomic @transaction.atomic
def handle(self, **options): def handle(self, **options):
if not options.get('hostname'):
raise CommandError("Specify `--hostname` to use this command.")
self.changed = False self.changed = False
self._register_hostname(options.get('hostname'), options.get('node_type'), options.get('uuid')) self._register_hostname(options.get('hostname'), options.get('node_type'), options.get('uuid'))
if self.changed: if self.changed:

View File

@@ -490,10 +490,6 @@ def cluster_node_heartbeat():
if inst.hostname == settings.CLUSTER_HOST_ID: if inst.hostname == settings.CLUSTER_HOST_ID:
this_inst = inst this_inst = inst
break break
else:
(changed, this_inst) = Instance.objects.get_or_register()
if changed:
logger.info("Registered tower control node '{}'".format(this_inst.hostname))
inspect_execution_nodes(instance_list) inspect_execution_nodes(instance_list)

View File

@@ -17,4 +17,11 @@ set -e
wait-for-migrations wait-for-migrations
supervisord -c /etc/supervisord.conf # This file will be re-written when the dispatcher calls reconfigure_rsyslog(),
# but it needs to exist when supervisor initially starts rsyslog to prevent the
# container from crashing. This was the most minimal config I could get working.
cat << EOF > /var/lib/awx/rsyslog/rsyslog.conf
action(type="omfile" file="/dev/null")
EOF
exec supervisord -c /etc/supervisord.conf

View File

@@ -17,4 +17,6 @@ set -e
wait-for-migrations wait-for-migrations
supervisord -c /etc/supervisord_task.conf awx-manage provision_instance
exec supervisord -c /etc/supervisord_task.conf

View File

@@ -0,0 +1,8 @@
#!/bin/bash
# Event-listener helper: for every event line received on stdin, log it to
# stderr and send SIGQUIT to the parent process.
#
# NOTE(review): presumably run by supervisord as an [eventlistener] subscribed
# to PROCESS_STATE_FATAL, in which case $PPID is supervisord itself and the
# signal asks it to shut down -- confirm against the supervisor config.

# Tell the parent we are ready to receive events.
printf "READY\n"

# IFS= and -r keep the event line verbatim (no whitespace trimming, no
# backslash processing) so the log shows exactly what was received.
while IFS= read -r line; do
    # printf instead of echo: the line is arbitrary data and must never be
    # interpreted as options or escape sequences.
    printf 'Processing Event: %s\n' "$line" >&2
    kill -SIGQUIT "$PPID"
done < /dev/stdin

View File

@@ -120,11 +120,7 @@ RUN curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master
chmod 700 get_helm.sh && \ chmod 700 get_helm.sh && \
./get_helm.sh ./get_helm.sh
# Install tini RUN pip3 install virtualenv supervisor dumb-init
RUN curl -L -o /usr/bin/tini https://github.com/krallin/tini/releases/download/v0.19.0/tini-{{ tini_architecture | default('amd64') }} && \
chmod +x /usr/bin/tini
RUN pip3 install virtualenv supervisor
RUN rm -rf /root/.cache && rm -rf /tmp/* RUN rm -rf /root/.cache && rm -rf /tmp/*
@@ -194,6 +190,7 @@ RUN mkdir -p /etc/containers/registries.conf.d/ && echo "unqualified-search-regi
# Create default awx rsyslog config # Create default awx rsyslog config
ADD tools/ansible/roles/dockerfile/files/rsyslog.conf /var/lib/awx/rsyslog/rsyslog.conf ADD tools/ansible/roles/dockerfile/files/rsyslog.conf /var/lib/awx/rsyslog/rsyslog.conf
ADD tools/ansible/roles/dockerfile/files/wait-for-migrations /usr/local/bin/wait-for-migrations ADD tools/ansible/roles/dockerfile/files/wait-for-migrations /usr/local/bin/wait-for-migrations
ADD tools/ansible/roles/dockerfile/files/stop-supervisor /usr/local/bin/stop-supervisor
## File mappings ## File mappings
{% if build_dev|bool %} {% if build_dev|bool %}
@@ -292,7 +289,7 @@ CMD ["/bin/bash"]
USER 1000 USER 1000
EXPOSE 8052 EXPOSE 8052
ENTRYPOINT ["/usr/bin/tini", "--"] ENTRYPOINT ["dumb-init", "--"]
CMD /usr/bin/launch_awx.sh CMD /usr/bin/launch_awx.sh
VOLUME /var/lib/nginx VOLUME /var/lib/nginx
VOLUME /var/lib/awx/.local/share/containers VOLUME /var/lib/awx/.local/share/containers

View File

@@ -12,9 +12,10 @@ directory = /awx_devel
{% else %} {% else %}
command = nginx -g "daemon off;" command = nginx -g "daemon off;"
{% endif %} {% endif %}
autostart = true
autorestart = true autorestart = true
stopwaitsecs = 5 startsecs = 30
stopasgroup=true
killasgroup=true
stdout_logfile=/dev/stdout stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0 stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr stderr_logfile=/dev/stderr
@@ -31,12 +32,10 @@ environment =
command = /var/lib/awx/venv/awx/bin/uwsgi /etc/tower/uwsgi.ini command = /var/lib/awx/venv/awx/bin/uwsgi /etc/tower/uwsgi.ini
directory = /var/lib/awx directory = /var/lib/awx
{% endif %} {% endif %}
autostart = true
autorestart = true autorestart = true
stopwaitsecs = 15 startsecs = 30
stopasgroup=true stopasgroup=true
killasgroup=true killasgroup=true
stopsignal=KILL
stdout_logfile=/dev/stdout stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0 stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr stderr_logfile=/dev/stderr
@@ -50,10 +49,8 @@ directory = /awx_devel
command = /var/lib/awx/venv/awx/bin/daphne -b 127.0.0.1 -p 8051 --websocket_timeout -1 awx.asgi:channel_layer command = /var/lib/awx/venv/awx/bin/daphne -b 127.0.0.1 -p 8051 --websocket_timeout -1 awx.asgi:channel_layer
directory = /var/lib/awx directory = /var/lib/awx
{% endif %} {% endif %}
autostart = true
stopsignal=KILL
autorestart = true autorestart = true
stopwaitsecs = 5 startsecs = 30
stopasgroup=true stopasgroup=true
killasgroup=true killasgroup=true
stdout_logfile=/dev/stdout stdout_logfile=/dev/stdout
@@ -69,9 +66,8 @@ directory = /awx_devel
command = awx-manage run_wsbroadcast command = awx-manage run_wsbroadcast
directory = /var/lib/awx directory = /var/lib/awx
{% endif %} {% endif %}
autostart = true
autorestart = true autorestart = true
stopwaitsecs = 5 startsecs = 30
stopasgroup=true stopasgroup=true
killasgroup=true killasgroup=true
stdout_logfile=/dev/stdout stdout_logfile=/dev/stdout
@@ -81,33 +77,26 @@ stderr_logfile_maxbytes=0
[program:awx-rsyslogd] [program:awx-rsyslogd]
command = rsyslogd -n -i /var/run/awx-rsyslog/rsyslog.pid -f /var/lib/awx/rsyslog/rsyslog.conf command = rsyslogd -n -i /var/run/awx-rsyslog/rsyslog.pid -f /var/lib/awx/rsyslog/rsyslog.conf
autostart = true
autorestart = true autorestart = true
startretries = 10 startsecs = 30
stopwaitsecs = 5
stopsignal=TERM
stopasgroup=true stopasgroup=true
killasgroup=true killasgroup=true
redirect_stderr=true stdout_logfile=/dev/stdout
stdout_logfile=/dev/stderr
stdout_logfile_maxbytes=0 stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[group:tower-processes] [group:tower-processes]
programs=nginx,uwsgi,daphne,wsbroadcast,awx-rsyslogd programs=nginx,uwsgi,daphne,wsbroadcast,awx-rsyslogd
priority=5 priority=5
# TODO: Exit Handler [eventlistener:superwatcher]
command=stop-supervisor
{% if kube_dev | bool %} events=PROCESS_STATE_FATAL
[eventlistener:awx-config-watcher]
command=/usr/bin/config-watcher
stderr_logfile=/dev/stdout
stderr_logfile_maxbytes=0
stdout_logfile=/dev/stdout stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0 stdout_logfile_maxbytes=0
events=TICK_60 stderr_logfile=/dev/stderr
priority=0 stderr_logfile_maxbytes=0
{% endif %}
[unix_http_server] [unix_http_server]
file=/var/run/supervisor/supervisor.web.sock file=/var/run/supervisor/supervisor.web.sock

View File

@@ -13,9 +13,8 @@ directory = /awx_devel
command = awx-manage run_dispatcher command = awx-manage run_dispatcher
directory = /var/lib/awx directory = /var/lib/awx
{% endif %} {% endif %}
autostart = true
autorestart = true autorestart = true
stopwaitsecs = 5 startsecs = 30
stopasgroup=true stopasgroup=true
killasgroup=true killasgroup=true
stdout_logfile=/dev/stdout stdout_logfile=/dev/stdout
@@ -31,9 +30,8 @@ directory = /awx_devel
command = awx-manage run_callback_receiver command = awx-manage run_callback_receiver
directory = /var/lib/awx directory = /var/lib/awx
{% endif %} {% endif %}
autostart = true
autorestart = true autorestart = true
stopwaitsecs = 5 startsecs = 30
stopasgroup=true stopasgroup=true
killasgroup=true killasgroup=true
stdout_logfile=/dev/stdout stdout_logfile=/dev/stdout
@@ -45,18 +43,14 @@ stderr_logfile_maxbytes=0
programs=dispatcher,callback-receiver programs=dispatcher,callback-receiver
priority=5 priority=5
# TODO: Exit Handler [eventlistener:superwatcher]
command=stop-supervisor
{% if kube_dev | bool %} events=PROCESS_STATE_FATAL
[eventlistener:awx-config-watcher] autorestart = true
command=/usr/bin/config-watcher
stderr_logfile=/dev/stdout
stderr_logfile_maxbytes=0
stdout_logfile=/dev/stdout stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0 stdout_logfile_maxbytes=0
events=TICK_60 stderr_logfile=/dev/stderr
priority=0 stderr_logfile_maxbytes=0
{% endif %}
[unix_http_server] [unix_http_server]
file=/var/run/supervisor/supervisor.sock file=/var/run/supervisor/supervisor.sock

View File

@@ -5,4 +5,4 @@ bootstrap_development.sh
cd /awx_devel cd /awx_devel
# Start the services # Start the services
exec tini -- make supervisor exec make supervisor

View File

@@ -5,79 +5,75 @@ nodaemon=true
[program:awx-dispatcher] [program:awx-dispatcher]
command = make dispatcher command = make dispatcher
autostart = true
autorestart = true autorestart = true
stopwaitsecs = 1 startsecs = 30
stopsignal=KILL
stopasgroup=true stopasgroup=true
killasgroup=true killasgroup=true
redirect_stderr=true stdout_logfile=/dev/stdout
stdout_events_enabled = true stdout_logfile_maxbytes=0
stderr_events_enabled = true stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:awx-receiver] [program:awx-receiver]
command = make receiver command = make receiver
autostart = true
autorestart = true autorestart = true
stopwaitsecs = 1 startsecs = 30
stopsignal=KILL
stopasgroup=true stopasgroup=true
killasgroup=true killasgroup=true
redirect_stderr=true stdout_logfile=/dev/stdout
stdout_events_enabled = true stdout_logfile_maxbytes=0
stderr_events_enabled = true stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:awx-wsbroadcast] [program:awx-wsbroadcast]
command = make wsbroadcast command = make wsbroadcast
autostart = true
autorestart = true autorestart = true
stopwaitsecs = 1 startsecs = 30
stopsignal=KILL autorestart = true
stopasgroup=true stopasgroup=true
killasgroup=true killasgroup=true
redirect_stderr=true stdout_logfile=/dev/stdout
stdout_events_enabled = true stdout_logfile_maxbytes=0
stderr_events_enabled = true stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:awx-uwsgi] [program:awx-uwsgi]
command = make uwsgi command = make uwsgi
autostart = true
autorestart = true autorestart = true
redirect_stderr=true startsecs = 30
stopwaitsecs = 1
stopsignal=KILL
stopasgroup=true stopasgroup=true
killasgroup=true killasgroup=true
stdout_events_enabled = true stdout_logfile=/dev/stdout
stderr_events_enabled = true stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:awx-daphne] [program:awx-daphne]
command = make daphne command = make daphne
autostart = true
autorestart = true autorestart = true
redirect_stderr=true startsecs = 30
stopwaitsecs = 1
stopsignal=KILL
stopasgroup=true stopasgroup=true
killasgroup=true killasgroup=true
stdout_events_enabled = true stdout_logfile=/dev/stdout
stderr_events_enabled = true stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:awx-nginx] [program:awx-nginx]
command = make nginx command = make nginx
autostart = true
autorestart = true autorestart = true
redirect_stderr=true startsecs = 30
stdout_events_enabled = true stopasgroup=true
stderr_events_enabled = true killasgroup=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:awx-rsyslogd] [program:awx-rsyslogd]
command = rsyslogd -n -i /var/run/awx-rsyslog/rsyslog.pid -f /var/lib/awx/rsyslog/rsyslog.conf command = rsyslogd -n -i /var/run/awx-rsyslog/rsyslog.pid -f /var/lib/awx/rsyslog/rsyslog.conf
autostart = true
autorestart = true autorestart = true
stopwaitsecs = 5 startsecs = 30
stopsignal=TERM
stopasgroup=true stopasgroup=true
killasgroup=true killasgroup=true
redirect_stderr=true redirect_stderr=true
@@ -86,19 +82,28 @@ stderr_events_enabled = true
[program:awx-receptor] [program:awx-receptor]
command = receptor --config /etc/receptor/receptor.conf command = receptor --config /etc/receptor/receptor.conf
autostart = true
autorestart = true autorestart = true
stopsignal = KILL startsecs = 30
stopasgroup = true stopasgroup=true
killasgroup = true killasgroup=true
redirect_stderr=true stdout_logfile=/dev/stdout
stdout_events_enabled = true stdout_logfile_maxbytes=0
stderr_events_enabled = true stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[group:tower-processes] [group:tower-processes]
programs=awx-dispatcher,awx-receiver,awx-uwsgi,awx-daphne,awx-nginx,awx-wsbroadcast,awx-rsyslogd programs=awx-dispatcher,awx-receiver,awx-uwsgi,awx-daphne,awx-nginx,awx-wsbroadcast,awx-rsyslogd
priority=5 priority=5
[eventlistener:superwatcher]
command=stop-supervisor
events=PROCESS_STATE_FATAL
autorestart = true
stderr_logfile=/dev/stdout
stderr_logfile_maxbytes=0
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
[unix_http_server] [unix_http_server]
file=/var/run/supervisor/supervisor.sock file=/var/run/supervisor/supervisor.sock