[rsyslog] Enable disk-assisted queuing on output (#14005)

Right now we only enable queuing on the rsyslog main_queue. This adds a
parameter to also enable it on the omhttp output action. As omhttp can
take time to process messages (e.g. blocking on the result of its HTTP
requests), this change allows for queuing messages up and hopefully
preventing some messages from getting lost when the log server is slow
to respond.

Signed-off-by: Rick Elrod <rick@elrod.me>
This commit is contained in:
Rick Elrod 2023-06-01 22:37:45 -05:00 committed by GitHub
parent b70fa88b78
commit 0ae720244c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 65 additions and 17 deletions

View File

@ -684,11 +684,28 @@ register(
field_class=fields.IntegerField,
default=1,
min_value=1,
label=_('Maximum disk persistance for external log aggregation (in GB)'),
label=_('Maximum disk persistence for external log aggregation (in GB)'),
help_text=_(
'Amount of data to store (in gigabytes) during an outage of '
'the external log aggregator (defaults to 1). '
'Equivalent to the rsyslogd queue.maxdiskspace setting.'
'Equivalent to the rsyslogd queue.maxdiskspace setting for main_queue. '
'Notably, this is used for the rsyslogd main queue (for input messages).'
),
category=_('Logging'),
category_slug='logging',
)
register(
'LOG_AGGREGATOR_ACTION_MAX_DISK_USAGE_GB',
field_class=fields.IntegerField,
default=1,
min_value=1,
label=_('Maximum disk persistence for rsyslogd action queuing (in GB)'),
help_text=_(
'Amount of data to store (in gigabytes) if an rsyslog action takes time '
'to process an incoming message (defaults to 1). '
'Equivalent to the rsyslogd queue.maxdiskspace setting on the action (e.g. omhttp). '
'Like LOG_AGGREGATOR_MAX_DISK_USAGE_GB, it stores files in the directory specified '
'by LOG_AGGREGATOR_MAX_DISK_USAGE_PATH.'
),
category=_('Logging'),
category_slug='logging',

View File

@ -47,7 +47,7 @@ data_loggly = {
'\n'.join(
[
'template(name="awx" type="string" string="%rawmsg-after-pri%")\nmodule(load="omhttp")',
'action(type="omhttp" server="logs-01.loggly.com" serverport="80" usehttps="off" allowunsignedcerts="off" skipverifyhost="off" action.resumeRetryCount="-1" template="awx" action.resumeInterval="5" errorfile="/var/log/tower/rsyslog.err" restpath="inputs/1fd38090-2af1-4e1e-8d80-492899da0f71/tag/http/")', # noqa
'action(type="omhttp" server="logs-01.loggly.com" serverport="80" usehttps="off" allowunsignedcerts="off" skipverifyhost="off" action.resumeRetryCount="-1" template="awx" action.resumeInterval="5" queue.spoolDirectory="/var/lib/awx" queue.filename="awx-external-logger-action-queue" queue.maxdiskspace="1g" queue.type="LinkedList" queue.saveOnShutdown="on" errorfile="/var/log/tower/rsyslog.err" restpath="inputs/1fd38090-2af1-4e1e-8d80-492899da0f71/tag/http/")', # noqa
]
),
),
@ -89,7 +89,7 @@ data_loggly = {
'\n'.join(
[
'template(name="awx" type="string" string="%rawmsg-after-pri%")\nmodule(load="omhttp")',
'action(type="omhttp" server="yoursplunk" serverport="443" usehttps="on" allowunsignedcerts="off" skipverifyhost="off" action.resumeRetryCount="-1" template="awx" action.resumeInterval="5" errorfile="/var/log/tower/rsyslog.err" restpath="services/collector/event")', # noqa
'action(type="omhttp" server="yoursplunk" serverport="443" usehttps="on" allowunsignedcerts="off" skipverifyhost="off" action.resumeRetryCount="-1" template="awx" action.resumeInterval="5" queue.spoolDirectory="/var/lib/awx" queue.filename="awx-external-logger-action-queue" queue.maxdiskspace="1g" queue.type="LinkedList" queue.saveOnShutdown="on" errorfile="/var/log/tower/rsyslog.err" restpath="services/collector/event")', # noqa
]
),
),
@ -103,7 +103,7 @@ data_loggly = {
'\n'.join(
[
'template(name="awx" type="string" string="%rawmsg-after-pri%")\nmodule(load="omhttp")',
'action(type="omhttp" server="yoursplunk" serverport="80" usehttps="off" allowunsignedcerts="off" skipverifyhost="off" action.resumeRetryCount="-1" template="awx" action.resumeInterval="5" errorfile="/var/log/tower/rsyslog.err" restpath="services/collector/event")', # noqa
'action(type="omhttp" server="yoursplunk" serverport="80" usehttps="off" allowunsignedcerts="off" skipverifyhost="off" action.resumeRetryCount="-1" template="awx" action.resumeInterval="5" queue.spoolDirectory="/var/lib/awx" queue.filename="awx-external-logger-action-queue" queue.maxdiskspace="1g" queue.type="LinkedList" queue.saveOnShutdown="on" errorfile="/var/log/tower/rsyslog.err" restpath="services/collector/event")', # noqa
]
),
),
@ -117,7 +117,7 @@ data_loggly = {
'\n'.join(
[
'template(name="awx" type="string" string="%rawmsg-after-pri%")\nmodule(load="omhttp")',
'action(type="omhttp" server="yoursplunk" serverport="8088" usehttps="on" allowunsignedcerts="off" skipverifyhost="off" action.resumeRetryCount="-1" template="awx" action.resumeInterval="5" errorfile="/var/log/tower/rsyslog.err" restpath="services/collector/event")', # noqa
'action(type="omhttp" server="yoursplunk" serverport="8088" usehttps="on" allowunsignedcerts="off" skipverifyhost="off" action.resumeRetryCount="-1" template="awx" action.resumeInterval="5" queue.spoolDirectory="/var/lib/awx" queue.filename="awx-external-logger-action-queue" queue.maxdiskspace="1g" queue.type="LinkedList" queue.saveOnShutdown="on" errorfile="/var/log/tower/rsyslog.err" restpath="services/collector/event")', # noqa
]
),
),
@ -131,7 +131,7 @@ data_loggly = {
'\n'.join(
[
'template(name="awx" type="string" string="%rawmsg-after-pri%")\nmodule(load="omhttp")',
'action(type="omhttp" server="yoursplunk" serverport="8088" usehttps="on" allowunsignedcerts="off" skipverifyhost="off" action.resumeRetryCount="-1" template="awx" action.resumeInterval="5" errorfile="/var/log/tower/rsyslog.err" restpath="services/collector/event")', # noqa
'action(type="omhttp" server="yoursplunk" serverport="8088" usehttps="on" allowunsignedcerts="off" skipverifyhost="off" action.resumeRetryCount="-1" template="awx" action.resumeInterval="5" queue.spoolDirectory="/var/lib/awx" queue.filename="awx-external-logger-action-queue" queue.maxdiskspace="1g" queue.type="LinkedList" queue.saveOnShutdown="on" errorfile="/var/log/tower/rsyslog.err" restpath="services/collector/event")', # noqa
]
),
),
@ -145,7 +145,7 @@ data_loggly = {
'\n'.join(
[
'template(name="awx" type="string" string="%rawmsg-after-pri%")\nmodule(load="omhttp")',
'action(type="omhttp" server="yoursplunk.org" serverport="8088" usehttps="on" allowunsignedcerts="off" skipverifyhost="off" action.resumeRetryCount="-1" template="awx" action.resumeInterval="5" errorfile="/var/log/tower/rsyslog.err" restpath="services/collector/event")', # noqa
'action(type="omhttp" server="yoursplunk.org" serverport="8088" usehttps="on" allowunsignedcerts="off" skipverifyhost="off" action.resumeRetryCount="-1" template="awx" action.resumeInterval="5" queue.spoolDirectory="/var/lib/awx" queue.filename="awx-external-logger-action-queue" queue.maxdiskspace="1g" queue.type="LinkedList" queue.saveOnShutdown="on" errorfile="/var/log/tower/rsyslog.err" restpath="services/collector/event")', # noqa
]
),
),
@ -159,7 +159,7 @@ data_loggly = {
'\n'.join(
[
'template(name="awx" type="string" string="%rawmsg-after-pri%")\nmodule(load="omhttp")',
'action(type="omhttp" server="yoursplunk.org" serverport="8088" usehttps="off" allowunsignedcerts="off" skipverifyhost="off" action.resumeRetryCount="-1" template="awx" action.resumeInterval="5" errorfile="/var/log/tower/rsyslog.err" restpath="services/collector/event")', # noqa
'action(type="omhttp" server="yoursplunk.org" serverport="8088" usehttps="off" allowunsignedcerts="off" skipverifyhost="off" action.resumeRetryCount="-1" template="awx" action.resumeInterval="5" queue.spoolDirectory="/var/lib/awx" queue.filename="awx-external-logger-action-queue" queue.maxdiskspace="1g" queue.type="LinkedList" queue.saveOnShutdown="on" errorfile="/var/log/tower/rsyslog.err" restpath="services/collector/event")', # noqa
]
),
),
@ -173,7 +173,7 @@ data_loggly = {
'\n'.join(
[
'template(name="awx" type="string" string="%rawmsg-after-pri%")\nmodule(load="omhttp")',
'action(type="omhttp" server="endpoint5.collection.us2.sumologic.com" serverport="443" usehttps="on" allowunsignedcerts="off" skipverifyhost="off" action.resumeRetryCount="-1" template="awx" action.resumeInterval="5" errorfile="/var/log/tower/rsyslog.err" restpath="receiver/v1/http/ZaVnC4dhaV0qoiETY0MrM3wwLoDgO1jFgjOxE6-39qokkj3LGtOroZ8wNaN2M6DtgYrJZsmSi4-36_Up5TbbN_8hosYonLKHSSOSKY845LuLZBCBwStrHQ==")', # noqa
'action(type="omhttp" server="endpoint5.collection.us2.sumologic.com" serverport="443" usehttps="on" allowunsignedcerts="off" skipverifyhost="off" action.resumeRetryCount="-1" template="awx" action.resumeInterval="5" queue.spoolDirectory="/var/lib/awx" queue.filename="awx-external-logger-action-queue" queue.maxdiskspace="1g" queue.type="LinkedList" queue.saveOnShutdown="on" errorfile="/var/log/tower/rsyslog.err" restpath="receiver/v1/http/ZaVnC4dhaV0qoiETY0MrM3wwLoDgO1jFgjOxE6-39qokkj3LGtOroZ8wNaN2M6DtgYrJZsmSi4-36_Up5TbbN_8hosYonLKHSSOSKY845LuLZBCBwStrHQ==")', # noqa
]
),
),

View File

@ -17,7 +17,8 @@ def construct_rsyslog_conf_template(settings=settings):
port = getattr(settings, 'LOG_AGGREGATOR_PORT', '')
protocol = getattr(settings, 'LOG_AGGREGATOR_PROTOCOL', '')
timeout = getattr(settings, 'LOG_AGGREGATOR_TCP_TIMEOUT', 5)
max_disk_space = getattr(settings, 'LOG_AGGREGATOR_MAX_DISK_USAGE_GB', 1)
max_disk_space_main_queue = getattr(settings, 'LOG_AGGREGATOR_MAX_DISK_USAGE_GB', 1)
max_disk_space_action_queue = getattr(settings, 'LOG_AGGREGATOR_ACTION_MAX_DISK_USAGE_GB', 1)
spool_directory = getattr(settings, 'LOG_AGGREGATOR_MAX_DISK_USAGE_PATH', '/var/lib/awx').rstrip('/')
error_log_file = getattr(settings, 'LOG_AGGREGATOR_RSYSLOGD_ERROR_LOG_FILE', '')
@ -32,7 +33,7 @@ def construct_rsyslog_conf_template(settings=settings):
'$WorkDirectory /var/lib/awx/rsyslog',
f'$MaxMessageSize {max_bytes}',
'$IncludeConfig /var/lib/awx/rsyslog/conf.d/*.conf',
f'main_queue(queue.spoolDirectory="{spool_directory}" queue.maxdiskspace="{max_disk_space}g" queue.type="Disk" queue.filename="awx-external-logger-backlog")', # noqa
f'main_queue(queue.spoolDirectory="{spool_directory}" queue.maxdiskspace="{max_disk_space_main_queue}g" queue.type="Disk" queue.filename="awx-external-logger-backlog")', # noqa
'module(load="imuxsock" SysSock.Use="off")',
'input(type="imuxsock" Socket="' + settings.LOGGING['handlers']['external_logger']['address'] + '" unlink="on" RateLimit.Burst="0")',
'template(name="awx" type="string" string="%rawmsg-after-pri%")',
@ -78,6 +79,11 @@ def construct_rsyslog_conf_template(settings=settings):
'action.resumeRetryCount="-1"',
'template="awx"',
f'action.resumeInterval="{timeout}"',
f'queue.spoolDirectory="{spool_directory}"',
'queue.filename="awx-external-logger-action-queue"',
f'queue.maxdiskspace="{max_disk_space_action_queue}g"',
'queue.type="LinkedList"',
'queue.saveOnShutdown="on"',
]
if error_log_file:
params.append(f'errorfile="{error_log_file}"')

View File

@ -790,7 +790,8 @@ LOG_AGGREGATOR_ENABLED = False
LOG_AGGREGATOR_TCP_TIMEOUT = 5
LOG_AGGREGATOR_VERIFY_CERT = True
LOG_AGGREGATOR_LEVEL = 'INFO'
LOG_AGGREGATOR_MAX_DISK_USAGE_GB = 1
LOG_AGGREGATOR_MAX_DISK_USAGE_GB = 1 # Main queue
LOG_AGGREGATOR_ACTION_MAX_DISK_USAGE_GB = 1 # Action queue
LOG_AGGREGATOR_MAX_DISK_USAGE_PATH = '/var/lib/awx'
LOG_AGGREGATOR_RSYSLOGD_DEBUG = False
LOG_AGGREGATOR_RSYSLOGD_ERROR_LOG_FILE = '/var/log/tower/rsyslog.err'

View File

@ -30,6 +30,7 @@ SettingsAPI.readCategory.mockResolvedValue({
LOG_AGGREGATOR_VERIFY_CERT: true,
LOG_AGGREGATOR_LEVEL: 'INFO',
LOG_AGGREGATOR_MAX_DISK_USAGE_GB: 1,
LOG_AGGREGATOR_ACTION_MAX_DISK_USAGE_GB: 1,
LOG_AGGREGATOR_MAX_DISK_USAGE_PATH: '/var/lib/awx',
LOG_AGGREGATOR_RSYSLOGD_DEBUG: false,
API_400_ERROR_LOG_FORMAT:

View File

@ -32,6 +32,7 @@ const mockSettings = {
LOG_AGGREGATOR_VERIFY_CERT: true,
LOG_AGGREGATOR_LEVEL: 'ERROR',
LOG_AGGREGATOR_MAX_DISK_USAGE_GB: 1,
LOG_AGGREGATOR_ACTION_MAX_DISK_USAGE_GB: 1,
LOG_AGGREGATOR_MAX_DISK_USAGE_PATH: '/var/lib/awx',
LOG_AGGREGATOR_RSYSLOGD_DEBUG: false,
API_400_ERROR_LOG_FORMAT:

View File

@ -594,8 +594,18 @@
"LOG_AGGREGATOR_MAX_DISK_USAGE_GB": {
"type": "integer",
"required": false,
"label": "Maximum disk persistance for external log aggregation (in GB)",
"help_text": "Amount of data to store (in gigabytes) during an outage of the external log aggregator (defaults to 1). Equivalent to the rsyslogd queue.maxdiskspace setting.",
"label": "Maximum disk persistence for external log aggregation (in GB)",
"help_text": "Amount of data to store (in gigabytes) during an outage of the external log aggregator (defaults to 1). Equivalent to the rsyslogd queue.maxdiskspace setting for main_queue. Notably, this is used for the rsyslogd main queue (for input messages).",
"min_value": 1,
"category": "Logging",
"category_slug": "logging",
"default": 1
},
"LOG_AGGREGATOR_ACTION_MAX_DISK_USAGE_GB": {
"type": "integer",
"required": false,
"label": "Maximum disk persistence for rsyslogd action queuing (in GB)",
"help_text": "Amount of data to store (in gigabytes) if an rsyslog action takes time to process an incoming message (defaults to 1). Equivalent to the rsyslogd queue.maxdiskspace setting on the action (e.g. omhttp). Like LOG_AGGREGATOR_MAX_DISK_USAGE_GB, it stores files in the directory specified by LOG_AGGREGATOR_MAX_DISK_USAGE_PATH.",
"min_value": 1,
"category": "Logging",
"category_slug": "logging",
@ -4453,13 +4463,23 @@
},
"LOG_AGGREGATOR_MAX_DISK_USAGE_GB": {
"type": "integer",
"label": "Maximum disk persistance for external log aggregation (in GB)",
"help_text": "Amount of data to store (in gigabytes) during an outage of the external log aggregator (defaults to 1). Equivalent to the rsyslogd queue.maxdiskspace setting.",
"label": "Maximum disk persistence for external log aggregation (in GB)",
"help_text": "Amount of data to store (in gigabytes) during an outage of the external log aggregator (defaults to 1). Equivalent to the rsyslogd queue.maxdiskspace setting for main_queue. Notably, this is used for the rsyslogd main queue (for input messages).",
"min_value": 1,
"category": "Logging",
"category_slug": "logging",
"defined_in_file": false
},
"LOG_AGGREGATOR_ACTION_MAX_DISK_USAGE_GB": {
"type": "integer",
"required": false,
"label": "Maximum disk persistence for rsyslogd action queuing (in GB)",
"help_text": "Amount of data to store (in gigabytes) if an rsyslog action takes time to process an incoming message (defaults to 1). Equivalent to the rsyslogd queue.maxdiskspace setting on the action (e.g. omhttp). Like LOG_AGGREGATOR_MAX_DISK_USAGE_GB, it stores files in the directory specified by LOG_AGGREGATOR_MAX_DISK_USAGE_PATH.",
"min_value": 1,
"category": "Logging",
"category_slug": "logging",
"default": 1
},
"LOG_AGGREGATOR_MAX_DISK_USAGE_PATH": {
"type": "string",
"label": "File system location for rsyslogd disk persistence",

View File

@ -78,6 +78,7 @@
"LOG_AGGREGATOR_VERIFY_CERT": true,
"LOG_AGGREGATOR_LEVEL": "INFO",
"LOG_AGGREGATOR_MAX_DISK_USAGE_GB": 1,
"LOG_AGGREGATOR_ACTION_MAX_DISK_USAGE_GB": 1,
"LOG_AGGREGATOR_MAX_DISK_USAGE_PATH": "/var/lib/awx",
"LOG_AGGREGATOR_RSYSLOGD_DEBUG": false,
"API_400_ERROR_LOG_FORMAT": "status {status_code} received by user {user_name} attempting to access {url_path} from {remote_addr}",

View File

@ -16,6 +16,7 @@
"LOG_AGGREGATOR_VERIFY_CERT": true,
"LOG_AGGREGATOR_LEVEL": "INFO",
"LOG_AGGREGATOR_MAX_DISK_USAGE_GB": 1,
"LOG_AGGREGATOR_ACTION_MAX_DISK_USAGE_GB": 1,
"LOG_AGGREGATOR_MAX_DISK_USAGE_PATH": "/var/lib/awx",
"LOG_AGGREGATOR_RSYSLOGD_DEBUG": false,
"API_400_ERROR_LOG_FORMAT": "Test Log Line"