Recover rsyslog from 4xx error

Due to https://github.com/ansible/awx/issues/7560

The 'omhttp' module for rsyslog will completely stop forwarding messages to the external log aggregator after receiving a 4xx error from that aggregator.

This PR is a "workaround" for this problem: it restarts rsyslogd after detecting that rsyslog received a 4xx error.
This commit is contained in:
Hao Liu 2023-12-11 16:12:43 -05:00 committed by Hao Liu
parent bf42c63c12
commit 6e5e1c8fff
5 changed files with 25 additions and 23 deletions

View File

@ -22,7 +22,7 @@ recursive-exclude awx/settings local_settings.py*
include tools/scripts/request_tower_configuration.sh
include tools/scripts/request_tower_configuration.ps1
include tools/scripts/automation-controller-service
include tools/scripts/failure-event-handler
include tools/scripts/rsyslog-4xx-recovery
include tools/scripts/awx-python
include awx/playbooks/library/mkfifo.py
include tools/sosreport/*

View File

@ -226,15 +226,18 @@ RUN ln -sf /awx_devel/{{ template_dest }}/supervisor_rsyslog.conf /etc/superviso
ADD tools/ansible/roles/dockerfile/files/launch_awx_web.sh /usr/bin/launch_awx_web.sh
ADD tools/ansible/roles/dockerfile/files/launch_awx_task.sh /usr/bin/launch_awx_task.sh
ADD tools/ansible/roles/dockerfile/files/launch_awx_rsyslog.sh /usr/bin/launch_awx_rsyslog.sh
ADD tools/scripts/rsyslog-4xx-recovery /usr/bin/rsyslog-4xx-recovery
ADD {{ template_dest }}/supervisor_web.conf /etc/supervisord_web.conf
ADD {{ template_dest }}/supervisor_task.conf /etc/supervisord_task.conf
ADD {{ template_dest }}/supervisor_rsyslog.conf /etc/supervisord_rsyslog.conf
ADD tools/scripts/awx-python /usr/bin/awx-python
{% endif %}
{% if (build_dev|bool) or (kube_dev|bool) %}
RUN echo /awx_devel > /var/lib/awx/venv/awx/lib/python3.9/site-packages/awx.egg-link
ADD tools/docker-compose/awx-manage /usr/local/bin/awx-manage
ADD tools/scripts/awx-python /usr/bin/awx-python
RUN ln -sf /awx_devel/tools/scripts/awx-python /usr/bin/awx-python
RUN ln -sf /awx_devel/tools/scripts/rsyslog-4xx-recovery /usr/bin/rsyslog-4xx-recovery
{% endif %}
# Pre-create things we need to access

View File

@ -8,7 +8,7 @@ pidfile = /var/run/supervisor/supervisor.rsyslog.pid
[program:awx-rsyslogd]
command = rsyslogd -n -i /var/run/awx-rsyslog/rsyslog.pid -f /var/lib/awx/rsyslog/rsyslog.conf
autorestart = true
startsecs = 30
startsecs = 0
stopasgroup=true
killasgroup=true
stdout_logfile=/dev/stdout
@ -59,6 +59,15 @@ stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr
stderr_logfile_maxbytes=0
[eventlistener:rsyslog-4xx-recovery]
command=rsyslog-4xx-recovery
buffer_size = 100
events=PROCESS_LOG_STDERR
priority=0
autorestart=true
stdout_events_enabled = true
stderr_events_enabled = true
[unix_http_server]
file=/var/run/supervisor/supervisor.rsyslog.sock

View File

@ -86,9 +86,9 @@ stderr_events_enabled = true
[program:awx-rsyslogd]
command = rsyslogd -n -i /var/run/awx-rsyslog/rsyslog.pid -f /var/lib/awx/rsyslog/rsyslog.conf
autorestart = true
startsecs=0
stopasgroup=true
killasgroup=true
redirect_stderr=true
stdout_events_enabled = true
stderr_events_enabled = true
@ -113,6 +113,15 @@ killasgroup=true
stdout_events_enabled = true
stderr_events_enabled = true
[eventlistener:rsyslog-4xx-recovery]
command=/awx_devel/tools/scripts/rsyslog-4xx-recovery
buffer_size = 100
events=PROCESS_LOG_STDERR
priority=0
autorestart=true
stdout_events_enabled = true
stderr_events_enabled = true
[unix_http_server]
file=/var/run/supervisor/supervisor.sock

View File

@ -11,12 +11,10 @@ def write_stdout(s):
sys.stdout.write(s)
sys.stdout.flush()
def write_stderr(s):
sys.stderr.write(s)
sys.stderr.flush()
def main():
while 1:
write_stdout("READY\n")
@ -31,23 +29,6 @@ def main():
except ValueError as e:
write_stderr(str(e))
# now decide what do to based on eventnames
if headers["eventname"] == "PROCESS_STATE_FATAL":
headers.update(
dict(
[x.split(":") for x in sys.stdin.read(int(headers["len"])).split()]
)
)
try:
# incoming event that produced PROCESS_STATE_FATAL will have a PID. SIGTERM it!
write_stderr(
f"{datetime.datetime.now(timezone.utc)} - sending SIGTERM to proc={headers} with data={headers}\n"
)
os.kill(headers["pid"], signal.SIGTERM)
except Exception as e:
write_stderr(str(e))
# awx-rsyslog PROCESS_LOG_STDERR handler
if headers["eventname"] == "PROCESS_LOG_STDERR":
# pertinent data to process that produced PROCES_LOG_STDERR is in the first line of the data payload; so lets extract it