Merge pull request #11955 from shanemcd/fail-better

Increase resiliency when application crashes
This commit is contained in:
Shane McDonald 2022-03-30 08:58:26 -04:00 committed by GitHub
commit ef0f6ca248
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 110 additions and 108 deletions

View File

@ -35,9 +35,11 @@ def reap(instance=None, status='failed', excluded_uuids=[]):
"""
me = instance
if me is None:
(changed, me) = Instance.objects.get_or_register()
if changed:
logger.info("Registered node '{}'".format(me.hostname))
try:
me = Instance.objects.me()
except RuntimeError as e:
logger.warning(f'Local instance is not registered, not running reaper: {e}')
return
now = tz_now()
workflow_ctype_id = ContentType.objects.get_for_model(WorkflowJob).id
jobs = UnifiedJob.objects.filter(

View File

@ -3,6 +3,7 @@
from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from django.conf import settings
from awx.main.models import Instance
@ -13,7 +14,7 @@ class Command(BaseCommand):
Register this instance with the database for HA tracking.
"""
help = "Add instance to the database. Specify `--hostname` to use this command."
help = "Add instance to the database. When no options are provided, the hostname of the current system will be used. Override with `--hostname`."
def add_arguments(self, parser):
parser.add_argument('--hostname', dest='hostname', type=str, help="Hostname used during provisioning")
@ -22,8 +23,11 @@ class Command(BaseCommand):
def _register_hostname(self, hostname, node_type, uuid):
if not hostname:
return
(changed, instance) = Instance.objects.register(hostname=hostname, node_type=node_type, uuid=uuid)
if not settings.AWX_AUTO_DEPROVISION_INSTANCES:
raise CommandError('Registering with values from settings only intended for use in K8s installs')
(changed, instance) = Instance.objects.get_or_register()
else:
(changed, instance) = Instance.objects.register(hostname=hostname, node_type=node_type, uuid=uuid)
if changed:
print("Successfully registered instance {}".format(hostname))
else:
@ -32,8 +36,6 @@ class Command(BaseCommand):
@transaction.atomic
def handle(self, **options):
if not options.get('hostname'):
raise CommandError("Specify `--hostname` to use this command.")
self.changed = False
self._register_hostname(options.get('hostname'), options.get('node_type'), options.get('uuid'))
if self.changed:

View File

@ -490,10 +490,6 @@ def cluster_node_heartbeat():
if inst.hostname == settings.CLUSTER_HOST_ID:
this_inst = inst
break
else:
(changed, this_inst) = Instance.objects.get_or_register()
if changed:
logger.info("Registered tower control node '{}'".format(this_inst.hostname))
inspect_execution_nodes(instance_list)

View File

@ -17,4 +17,11 @@ set -e
wait-for-migrations
supervisord -c /etc/supervisord.conf
# This file will be re-written when the dispatcher calls reconfigure_rsyslog(),
# but it needs to exist when supervisor initially starts rsyslog to prevent the
# container from crashing. This was the most minimal config I could get working.
cat << EOF > /var/lib/awx/rsyslog/rsyslog.conf
action(type="omfile" file="/dev/null")
EOF
exec supervisord -c /etc/supervisord.conf

View File

@ -17,4 +17,6 @@ set -e
wait-for-migrations
supervisord -c /etc/supervisord_task.conf
awx-manage provision_instance
exec supervisord -c /etc/supervisord_task.conf

View File

@ -0,0 +1,8 @@
#!/bin/bash
# stop-supervisor: event-listener helper that stops the parent process as
# soon as any event is delivered on stdin.
#
# It is wired up in the supervisor configs as the command of the
# [eventlistener:superwatcher] section, subscribed to PROCESS_STATE_FATAL.
# The listener is spawned by supervisord, so $PPID is expected to be the
# supervisord process; SIGQUIT asks it to shut down.
# NOTE(review): confirm supervisord is always the direct parent (no wrapper
# shell in between), otherwise the signal goes to the wrong process.

# Tell the parent we are ready to receive an event.
printf "READY\n"

# -r keeps backslashes in the event line intact so it is logged verbatim.
while read -r line; do
    # Log to stderr: stdout is reserved for the READY/RESULT protocol tokens.
    echo "Processing Event: $line" >&2
    # Quote the PID so the argument is never word-split or globbed.
    kill -SIGQUIT "$PPID"
done < /dev/stdin

View File

@ -120,11 +120,7 @@ RUN curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master
chmod 700 get_helm.sh && \
./get_helm.sh
# Install tini
RUN curl -L -o /usr/bin/tini https://github.com/krallin/tini/releases/download/v0.19.0/tini-{{ tini_architecture | default('amd64') }} && \
chmod +x /usr/bin/tini
RUN pip3 install virtualenv supervisor
RUN pip3 install virtualenv supervisor dumb-init
RUN rm -rf /root/.cache && rm -rf /tmp/*
@ -194,6 +190,7 @@ RUN mkdir -p /etc/containers/registries.conf.d/ && echo "unqualified-search-regi
# Create default awx rsyslog config
ADD tools/ansible/roles/dockerfile/files/rsyslog.conf /var/lib/awx/rsyslog/rsyslog.conf
ADD tools/ansible/roles/dockerfile/files/wait-for-migrations /usr/local/bin/wait-for-migrations
ADD tools/ansible/roles/dockerfile/files/stop-supervisor /usr/local/bin/stop-supervisor
## File mappings
{% if build_dev|bool %}
@ -264,9 +261,9 @@ RUN for dir in \
for file in \
/etc/containers/containers.conf \
/var/lib/awx/.config/containers/containers.conf \
/var/lib/shared/overlay-images/images.lock \
/var/lib/shared/overlay-images/images.lock \
/var/lib/shared/overlay-layers/layers.lock \
/var/lib/shared/vfs-images/images.lock \
/var/lib/shared/vfs-images/images.lock \
/var/lib/shared/vfs-layers/layers.lock \
/var/run/nginx.pid \
/var/lib/awx/venv/awx/lib/python3.9/site-packages/awx.egg-link ; \
@ -292,7 +289,7 @@ CMD ["/bin/bash"]
USER 1000
EXPOSE 8052
ENTRYPOINT ["/usr/bin/tini", "--"]
ENTRYPOINT ["dumb-init", "--"]
CMD /usr/bin/launch_awx.sh
VOLUME /var/lib/nginx
VOLUME /var/lib/awx/.local/share/containers

View File

@ -12,9 +12,10 @@ directory = /awx_devel
{% else %}
command = nginx -g "daemon off;"
{% endif %}
autostart = true
autorestart = true
stopwaitsecs = 5
startsecs = 30
stopasgroup=true
killasgroup=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
@ -31,12 +32,10 @@ environment =
command = /var/lib/awx/venv/awx/bin/uwsgi /etc/tower/uwsgi.ini
directory = /var/lib/awx
{% endif %}
autostart = true
autorestart = true
stopwaitsecs = 15
startsecs = 30
stopasgroup=true
killasgroup=true
stopsignal=KILL
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
@ -50,10 +49,8 @@ directory = /awx_devel
command = /var/lib/awx/venv/awx/bin/daphne -b 127.0.0.1 -p 8051 --websocket_timeout -1 awx.asgi:channel_layer
directory = /var/lib/awx
{% endif %}
autostart = true
stopsignal=KILL
autorestart = true
stopwaitsecs = 5
startsecs = 30
stopasgroup=true
killasgroup=true
stdout_logfile=/dev/stdout
@ -69,9 +66,8 @@ directory = /awx_devel
command = awx-manage run_wsbroadcast
directory = /var/lib/awx
{% endif %}
autostart = true
autorestart = true
stopwaitsecs = 5
startsecs = 30
stopasgroup=true
killasgroup=true
stdout_logfile=/dev/stdout
@ -81,33 +77,26 @@ stderr_logfile_maxbytes=0
[program:awx-rsyslogd]
command = rsyslogd -n -i /var/run/awx-rsyslog/rsyslog.pid -f /var/lib/awx/rsyslog/rsyslog.conf
autostart = true
autorestart = true
startretries = 10
stopwaitsecs = 5
stopsignal=TERM
startsecs = 30
stopasgroup=true
killasgroup=true
redirect_stderr=true
stdout_logfile=/dev/stderr
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[group:tower-processes]
programs=nginx,uwsgi,daphne,wsbroadcast,awx-rsyslogd
priority=5
# TODO: Exit Handler
{% if kube_dev | bool %}
[eventlistener:awx-config-watcher]
command=/usr/bin/config-watcher
stderr_logfile=/dev/stdout
stderr_logfile_maxbytes=0
[eventlistener:superwatcher]
command=stop-supervisor
events=PROCESS_STATE_FATAL
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
events=TICK_60
priority=0
{% endif %}
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[unix_http_server]
file=/var/run/supervisor/supervisor.web.sock

View File

@ -13,9 +13,8 @@ directory = /awx_devel
command = awx-manage run_dispatcher
directory = /var/lib/awx
{% endif %}
autostart = true
autorestart = true
stopwaitsecs = 5
startsecs = 30
stopasgroup=true
killasgroup=true
stdout_logfile=/dev/stdout
@ -31,9 +30,8 @@ directory = /awx_devel
command = awx-manage run_callback_receiver
directory = /var/lib/awx
{% endif %}
autostart = true
autorestart = true
stopwaitsecs = 5
startsecs = 30
stopasgroup=true
killasgroup=true
stdout_logfile=/dev/stdout
@ -45,18 +43,14 @@ stderr_logfile_maxbytes=0
programs=dispatcher,callback-receiver
priority=5
# TODO: Exit Handler
{% if kube_dev | bool %}
[eventlistener:awx-config-watcher]
command=/usr/bin/config-watcher
stderr_logfile=/dev/stdout
stderr_logfile_maxbytes=0
[eventlistener:superwatcher]
command=stop-supervisor
events=PROCESS_STATE_FATAL
autorestart = true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
events=TICK_60
priority=0
{% endif %}
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[unix_http_server]
file=/var/run/supervisor/supervisor.sock

View File

@ -5,4 +5,4 @@ bootstrap_development.sh
cd /awx_devel
# Start the services
exec tini -- make supervisor
exec make supervisor

View File

@ -5,79 +5,75 @@ nodaemon=true
[program:awx-dispatcher]
command = make dispatcher
autostart = true
autorestart = true
stopwaitsecs = 1
stopsignal=KILL
startsecs = 30
stopasgroup=true
killasgroup=true
redirect_stderr=true
stdout_events_enabled = true
stderr_events_enabled = true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:awx-receiver]
command = make receiver
autostart = true
autorestart = true
stopwaitsecs = 1
stopsignal=KILL
startsecs = 30
stopasgroup=true
killasgroup=true
redirect_stderr=true
stdout_events_enabled = true
stderr_events_enabled = true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:awx-wsbroadcast]
command = make wsbroadcast
autostart = true
autorestart = true
stopwaitsecs = 1
stopsignal=KILL
startsecs = 30
autorestart = true
stopasgroup=true
killasgroup=true
redirect_stderr=true
stdout_events_enabled = true
stderr_events_enabled = true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:awx-uwsgi]
command = make uwsgi
autostart = true
autorestart = true
redirect_stderr=true
stopwaitsecs = 1
stopsignal=KILL
startsecs = 30
stopasgroup=true
killasgroup=true
stdout_events_enabled = true
stderr_events_enabled = true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:awx-daphne]
command = make daphne
autostart = true
autorestart = true
redirect_stderr=true
stopwaitsecs = 1
stopsignal=KILL
startsecs = 30
stopasgroup=true
killasgroup=true
stdout_events_enabled = true
stderr_events_enabled = true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:awx-nginx]
command = make nginx
autostart = true
autorestart = true
redirect_stderr=true
stdout_events_enabled = true
stderr_events_enabled = true
startsecs = 30
stopasgroup=true
killasgroup=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[program:awx-rsyslogd]
command = rsyslogd -n -i /var/run/awx-rsyslog/rsyslog.pid -f /var/lib/awx/rsyslog/rsyslog.conf
autostart = true
autorestart = true
stopwaitsecs = 5
stopsignal=TERM
startsecs = 30
stopasgroup=true
killasgroup=true
redirect_stderr=true
@ -86,19 +82,28 @@ stderr_events_enabled = true
[program:awx-receptor]
command = receptor --config /etc/receptor/receptor.conf
autostart = true
autorestart = true
stopsignal = KILL
stopasgroup = true
killasgroup = true
redirect_stderr=true
stdout_events_enabled = true
stderr_events_enabled = true
startsecs = 30
stopasgroup=true
killasgroup=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[group:tower-processes]
programs=awx-dispatcher,awx-receiver,awx-uwsgi,awx-daphne,awx-nginx,awx-wsbroadcast,awx-rsyslogd
priority=5
[eventlistener:superwatcher]
command=stop-supervisor
events=PROCESS_STATE_FATAL
autorestart = true
stderr_logfile=/dev/stdout
stderr_logfile_maxbytes=0
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
[unix_http_server]
file=/var/run/supervisor/supervisor.sock