awx/tools/scripts/rsyslog-4xx-recovery
Hao Liu 6e5e1c8fff Recover rsyslog from 4xx error
Due to https://github.com/ansible/awx/issues/7560

The 'omhttp' module for rsyslog completely stops forwarding messages to the external log aggregator after receiving a 4xx error from that aggregator.

This PR is a "workaround" for this problem: it restarts rsyslogd after detecting that rsyslog received a 4xx error.
2023-12-14 10:41:48 -05:00

55 lines
1.8 KiB
Plaintext
Executable File

#!/usr/bin/awx-python
import sys
import os
import signal
import datetime
from datetime import timezone
def write_stdout(s):
    """Write the string ``s`` to standard output and flush it at once.

    Flushing after every write pushes the text out immediately instead of
    leaving it in the stream's buffer.
    """
    stream = sys.stdout
    stream.write(s)
    stream.flush()
def write_stderr(s):
    """Write the string ``s`` to standard error and flush it at once.

    Flushing after every write pushes the text out immediately instead of
    leaving it in the stream's buffer.
    """
    stream = sys.stderr
    stream.write(s)
    stream.flush()
def _parse_tokens(text):
    """Return a dict built from whitespace-separated ``key:value`` tokens.

    Each token is split on its first colon only, so a value that itself
    contains ``:`` is kept whole. Tokens without a colon are ignored.
    """
    pairs = {}
    for token in text.split():
        key, sep, value = token.partition(":")
        if sep:
            pairs[key] = value
    return pairs


def _has_4xx_code(text):
    """Return True if ``text`` contains a standalone token in 400..419.

    NOTE(review): only 400-419 are matched although the surrounding comments
    speak of "4xx" in general (429, for example, is not detected) -- confirm
    whether the narrower range is intentional before widening it.
    """
    # Build the token set once instead of re-splitting for every code.
    tokens = set(text.split())
    return any(str(code) in tokens for code in range(400, 420))


def _handle_event(headers, data):
    """Send SIGTERM to awx-rsyslogd when its stderr output shows a 4xx code.

    ``headers`` is the parsed header line of the event and ``data`` is its
    payload. Only ``PROCESS_LOG_STDERR`` events whose first payload line names
    the process ``awx-rsyslogd`` are acted upon; everything else is ignored.
    """
    if headers.get("eventname") != "PROCESS_LOG_STDERR":
        return

    # Details of the process that produced the PROCESS_LOG_STDERR event are on
    # the first line of the payload; the remaining lines are the log output.
    first_line, _, body = data.partition("\n")
    proc_details = _parse_tokens(first_line)
    if proc_details.get("processname") != "awx-rsyslogd":
        return

    # Scan the body with its newlines intact: joining the lines with no
    # separator glues the last word of one line to the first word of the next,
    # which can hide a code or fabricate one.
    if not _has_4xx_code(body):
        return

    # The reported message keeps the original newline-free form.
    log_message = "".join(data.split("\n")[1:])
    try:
        pid = int(proc_details["pid"])
        write_stderr(
            f"{datetime.datetime.now(timezone.utc)} - sending SIGTERM to proc=[{proc_details['processname']}] with pid=[{pid}] due to log_message=[{log_message}]\n"
        )
        os.kill(pid, signal.SIGTERM)
    except (KeyError, ValueError, OSError) as e:
        # Best effort: a missing or invalid pid, or a process that is already
        # gone, must not take the listener down.
        write_stderr(f"unable to signal awx-rsyslogd: {e!r}\n")


def main():
    """Run the listener loop that restarts rsyslogd after a 4xx response.

    Each iteration writes ``READY`` to stdout, reads one header line of
    ``key:value`` tokens from stdin, reads the ``len`` characters of payload
    announced by that header, processes the event, and acknowledges it with
    a ``RESULT 2`` line followed by ``OK``.

    The loop ends when stdin reaches end-of-file. A header that cannot be
    parsed is reported on stderr and acknowledged without being processed, so
    a bad event never runs against undefined or stale header/payload values.

    NOTE(review): the READY/RESULT handshake and header fields look like the
    Supervisor event-listener protocol -- confirm against the service
    configuration. ``sys.stdin.read(n)`` counts characters on a text stream;
    confirm that ``len`` is not a byte count for non-ASCII payloads.
    """
    while True:
        write_stdout("READY\n")

        line = sys.stdin.readline()
        if not line:
            # End of input: the sender closed the pipe, nothing left to do.
            return

        try:
            headers = _parse_tokens(line)
            data = sys.stdin.read(int(headers["len"]))
        except (KeyError, ValueError) as e:
            write_stderr(f"unable to parse event header {line!r}: {e!r}\n")
            # Still acknowledge so the sender is not left waiting for a result.
            write_stdout("RESULT 2\nOK")
            continue

        _handle_event(headers, data)
        write_stdout("RESULT 2\nOK")
# Start the listener loop only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()