Mirror of https://github.com/ansible/awx.git (synced 2026-01-10 15:32:07 -03:30)
Integrate scheduler into dispatcher main loop (#14067)

- Dispatcher refactoring to get pg_notify publish payload as a separate method
- Refactor periodic module under dispatcher entirely
- Use real numbers for schedule reference time
- Run based on due_to_run method
- Review comments about naming and code comments
This commit is contained in:
parent 14992cee17
commit 284bd8377a
@@ -40,8 +40,12 @@ def get_task_queuename():
 
 
 class PubSub(object):
-    def __init__(self, conn):
+    def __init__(self, conn, select_timeout=None):
         self.conn = conn
+        if select_timeout is None:
+            self.select_timeout = 5
+        else:
+            self.select_timeout = select_timeout
 
     def listen(self, channel):
         with self.conn.cursor() as cur:
@@ -72,12 +76,12 @@ class PubSub(object):
             n = psycopg.connection.Notify(pgn.relname.decode(enc), pgn.extra.decode(enc), pgn.be_pid)
             yield n
 
-    def events(self, select_timeout=5, yield_timeouts=False):
+    def events(self, yield_timeouts=False):
         if not self.conn.autocommit:
             raise RuntimeError('Listening for events can only be done in autocommit mode')
 
         while True:
-            if select.select([self.conn], [], [], select_timeout) == NOT_READY:
+            if select.select([self.conn], [], [], self.select_timeout) == NOT_READY:
                 if yield_timeouts:
                     yield None
                 else:
@@ -90,7 +94,7 @@ class PubSub(object):
 
 
 @contextmanager
-def pg_bus_conn(new_connection=False):
+def pg_bus_conn(new_connection=False, select_timeout=None):
     '''
     Any listeners probably want to establish a new database connection,
     separate from the Django connection used for queries, because that will prevent
@@ -115,7 +119,7 @@ def pg_bus_conn(new_connection=False):
            raise RuntimeError('Unexpectedly could not connect to postgres for pg_notify actions')
        conn = pg_connection.connection
 
-    pubsub = PubSub(conn)
+    pubsub = PubSub(conn, select_timeout=select_timeout)
     yield pubsub
     if new_connection:
         conn.close()
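
Illustration, not part of the diff: with the timeout now stored on the PubSub object, a listener picks its read timeout once and events() yields None on idle periods. A minimal sketch, assuming a hypothetical 'my_channel' channel name:

    from awx.main.dispatch import pg_bus_conn

    # select_timeout is saved on the PubSub object; events(yield_timeouts=True)
    # yields None whenever select() times out, so the caller can interleave
    # periodic work with message handling on a single loop.
    with pg_bus_conn(new_connection=True, select_timeout=2) as conn:
        conn.listen('my_channel')  # hypothetical channel name
        for event in conn.events(yield_timeouts=True):
            if event is None:
                print('no message within 2 seconds; run housekeeping here')
            else:
                print(f'received payload: {event.payload}')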
@@ -40,6 +40,9 @@ class Control(object):
     def cancel(self, task_ids, *args, **kwargs):
         return self.control_with_reply('cancel', *args, extra_data={'task_ids': task_ids}, **kwargs)
 
+    def schedule(self, *args, **kwargs):
+        return self.control_with_reply('schedule', *args, **kwargs)
+
     @classmethod
     def generate_reply_queue_name(cls):
         return f"reply_to_{str(uuid.uuid4()).replace('-','_')}"
@@ -52,14 +55,14 @@ class Control(object):
         if not connection.get_autocommit():
             raise RuntimeError('Control-with-reply messages can only be done in autocommit mode')
 
-        with pg_bus_conn() as conn:
+        with pg_bus_conn(select_timeout=timeout) as conn:
             conn.listen(reply_queue)
             send_data = {'control': command, 'reply_to': reply_queue}
             if extra_data:
                 send_data.update(extra_data)
             conn.notify(self.queuename, json.dumps(send_data))
 
-            for reply in conn.events(select_timeout=timeout, yield_timeouts=True):
+            for reply in conn.events(yield_timeouts=True):
                 if reply is None:
                     logger.error(f'{self.service} did not reply within {timeout}s')
                     raise RuntimeError(f"{self.service} did not reply within {timeout}s")
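
Illustration, not part of the diff: the new 'schedule' control command follows the same request/reply pattern as 'status' and 'running'. A sketch of querying a live dispatcher:

    from awx.main.dispatch.control import Control

    # Sends {'control': 'schedule', 'reply_to': ...} over pg_notify and waits
    # (bounded by the control_with_reply timeout) for the dispatcher's reply,
    # which is the YAML text produced by Scheduler.debug()
    print(Control('dispatcher').schedule())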
@@ -1,57 +1,142 @@
 import logging
-import os
 import time
-from multiprocessing import Process
+import yaml
+from datetime import datetime
 
-from django.conf import settings
-from django.db import connections
-from schedule import Scheduler
-from django_guid import set_guid
-from django_guid.utils import generate_guid
-
-from awx.main.dispatch.worker import TaskWorker
-from awx.main.utils.db import set_connection_name
 
 logger = logging.getLogger('awx.main.dispatch.periodic')
 
 
-class Scheduler(Scheduler):
-    def run_continuously(self):
-        idle_seconds = max(1, min(self.jobs).period.total_seconds() / 2)
-
-        def run():
-            ppid = os.getppid()
-            logger.warning('periodic beat started')
-
-            set_connection_name('periodic')  # set application_name to distinguish from other dispatcher processes
-
-            while True:
-                if os.getppid() != ppid:
-                    # if the parent PID changes, this process has been orphaned
-                    # via e.g., segfault or sigkill, we should exit too
-                    pid = os.getpid()
-                    logger.warning(f'periodic beat exiting gracefully pid:{pid}')
-                    raise SystemExit()
-                try:
-                    for conn in connections.all():
-                        # If the database connection has a hiccup, re-establish a new
-                        # connection
-                        conn.close_if_unusable_or_obsolete()
-                    set_guid(generate_guid())
-                    self.run_pending()
-                except Exception:
-                    logger.exception('encountered an error while scheduling periodic tasks')
-                time.sleep(idle_seconds)
-
-        process = Process(target=run)
-        process.daemon = True
-        process.start()
-
-
-def run_continuously():
-    scheduler = Scheduler()
-    for task in settings.CELERYBEAT_SCHEDULE.values():
-        apply_async = TaskWorker.resolve_callable(task['task']).apply_async
-        total_seconds = task['schedule'].total_seconds()
-        scheduler.every(total_seconds).seconds.do(apply_async)
-    scheduler.run_continuously()
+class ScheduledTask:
+    """
+    Class representing schedules, very loosely modeled after the python schedule library Job
+    the idea of this class is to:
+    - only deal in relative times (time since the scheduler global start)
+    - only deal in integer math for target runtimes, but float for current relative time
+
+    Missed schedule policy:
+    Invariant target times are maintained, meaning that if interval=10s offset=0
+    and it runs at t=7s, then it calls for next run in 3s.
+    However, if a complete interval has passed, that is counted as a missed run,
+    and missed runs are abandoned (no catch-up runs).
+    """
+
+    def __init__(self, name: str, data: dict):
+        # parameters needed for schedule computation
+        self.interval = int(data['schedule'].total_seconds())
+        self.offset = 0  # offset relative to start time this schedule begins
+        self.index = 0  # number of periods of the schedule that have passed
+
+        # parameters that do not affect scheduling logic
+        self.last_run = None  # time of last run, only used for debug
+        self.completed_runs = 0  # number of times schedule is known to run
+        self.name = name
+        self.data = data  # used by caller to know what to run
+
+    @property
+    def next_run(self):
+        "Time until the next run with t=0 being the global_start of the scheduler class"
+        return (self.index + 1) * self.interval + self.offset
+
+    def due_to_run(self, relative_time):
+        return bool(self.next_run <= relative_time)
+
+    def expected_runs(self, relative_time):
+        return int((relative_time - self.offset) / self.interval)
+
+    def mark_run(self, relative_time):
+        self.last_run = relative_time
+        self.completed_runs += 1
+        new_index = self.expected_runs(relative_time)
+        if new_index > self.index + 1:
+            logger.warning(f'Missed {new_index - self.index - 1} schedules of {self.name}')
+        self.index = new_index
+
+    def missed_runs(self, relative_time):
+        "Number of times job was supposed to run but failed to, only used for debug"
+        missed_ct = self.expected_runs(relative_time) - self.completed_runs
+        # if this is currently due to run do not count that as a missed run
+        if missed_ct and self.due_to_run(relative_time):
+            missed_ct -= 1
+        return missed_ct
+
+
+class Scheduler:
+    def __init__(self, schedule):
+        """
+        Expects schedule in the form of a dictionary like
+        {
+            'job1': {'schedule': timedelta(seconds=50), 'other': 'stuff'}
+        }
+        Only the schedule nearest-second value is used for scheduling;
+        the rest of the data is for use by the caller to know what to run.
+        """
+        self.jobs = [ScheduledTask(name, data) for name, data in schedule.items()]
+        min_interval = min(job.interval for job in self.jobs)
+        num_jobs = len(self.jobs)
+
+        # this is intentionally opinionated against spammy schedules
+        # a core goal is to spread out the scheduled tasks (for worker management)
+        # and high-frequency schedules just do not work with that
+        if num_jobs > min_interval:
+            raise RuntimeError(f'Number of schedules ({num_jobs}) is more than the shortest schedule interval ({min_interval} seconds).')
+
+        # evenly space out jobs over the base interval
+        for i, job in enumerate(self.jobs):
+            job.offset = (i * min_interval) // num_jobs
+
+        # internally times are all referenced relative to startup time, add grace period
+        self.global_start = time.time() + 2.0
+
+    def get_and_mark_pending(self):
+        relative_time = time.time() - self.global_start
+        to_run = []
+        for job in self.jobs:
+            if job.due_to_run(relative_time):
+                to_run.append(job)
+                logger.debug(f'scheduler found {job.name} to run, {relative_time - job.next_run} seconds after target')
+                job.mark_run(relative_time)
+        return to_run
+
+    def time_until_next_run(self):
+        relative_time = time.time() - self.global_start
+        next_job = min(self.jobs, key=lambda j: j.next_run)
+        delta = next_job.next_run - relative_time
+        if delta <= 0.1:
+            # careful not to give 0 or negative values to the select timeout, which has unclear interpretation
+            logger.warning(f'Scheduler next run of {next_job.name} is {-delta} seconds in the past')
+            return 0.1
+        elif delta > 20.0:
+            logger.warning(f'Scheduler next run unexpectedly over 20 seconds in future: {delta}')
+            return 20.0
+        logger.debug(f'Scheduler next run is {next_job.name} in {delta} seconds')
+        return delta
+
+    def debug(self, *args, **kwargs):
+        data = dict()
+        data['title'] = 'Scheduler status'
+
+        now = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S UTC')
+        start_time = datetime.fromtimestamp(self.global_start).strftime('%Y-%m-%d %H:%M:%S UTC')
+        relative_time = time.time() - self.global_start
+        data['started_time'] = start_time
+        data['current_time'] = now
+        data['current_time_relative'] = round(relative_time, 3)
+        data['total_schedules'] = len(self.jobs)
+
+        data['schedule_list'] = dict(
+            [
+                (
+                    job.name,
+                    dict(
+                        last_run_seconds_ago=round(relative_time - job.last_run, 3) if job.last_run else None,
+                        next_run_in_seconds=round(job.next_run - relative_time, 3),
+                        offset_in_seconds=job.offset,
+                        completed_runs=job.completed_runs,
+                        missed_runs=job.missed_runs(relative_time),
+                    ),
+                )
+                for job in sorted(self.jobs, key=lambda job: job.interval)
+            ]
+        )
+        return yaml.safe_dump(data, default_flow_style=False, sort_keys=False)
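
Worked example, not part of the diff, of the arithmetic above: offsets are (i * min_interval) // num_jobs, and target times stay on a fixed grid even when runs are late or skipped.

    from datetime import timedelta
    from awx.main.dispatch.periodic import Scheduler

    # two 10-second jobs are spread out with offsets 0 and 5
    scheduler = Scheduler({
        'joba': {'schedule': timedelta(seconds=10)},
        'jobb': {'schedule': timedelta(seconds=10)},
    })
    job_a, job_b = scheduler.jobs
    assert (job_a.offset, job_b.offset) == (0, 5)

    # joba targets t=10, 20, 30, ...; running late does not shift the grid
    job_a.mark_run(12.0)         # covers the t=10 target; next target is still t=20
    assert job_a.next_run == 20
    job_a.mark_run(45.0)         # the t=20 and t=30 targets passed entirely, so this
    assert job_a.next_run == 50  # logs 'Missed 2 schedules of joba' and abandons them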
@@ -73,15 +73,15 @@ class task:
         return cls.apply_async(args, kwargs)
 
     @classmethod
-    def apply_async(cls, args=None, kwargs=None, queue=None, uuid=None, **kw):
+    def get_async_body(cls, args=None, kwargs=None, uuid=None, **kw):
+        """
+        Get the python dict to become JSON data in the pg_notify message
+        This same message gets passed over the dispatcher IPC queue to workers
+        If a task is submitted to a multiprocessing pool, skipping pg_notify, this might be used directly
+        """
         task_id = uuid or str(uuid4())
         args = args or []
         kwargs = kwargs or {}
-        queue = queue or getattr(cls.queue, 'im_func', cls.queue)
-        if not queue:
-            msg = f'{cls.name}: Queue value required and may not be None'
-            logger.error(msg)
-            raise ValueError(msg)
         obj = {'uuid': task_id, 'args': args, 'kwargs': kwargs, 'task': cls.name, 'time_pub': time.time()}
         guid = get_guid()
         if guid:
@@ -89,6 +89,16 @@ class task:
         if bind_kwargs:
             obj['bind_kwargs'] = bind_kwargs
         obj.update(**kw)
+        return obj
+
+    @classmethod
+    def apply_async(cls, args=None, kwargs=None, queue=None, uuid=None, **kw):
+        queue = queue or getattr(cls.queue, 'im_func', cls.queue)
+        if not queue:
+            msg = f'{cls.name}: Queue value required and may not be None'
+            logger.error(msg)
+            raise ValueError(msg)
+        obj = cls.get_async_body(args=args, kwargs=kwargs, uuid=uuid, **kw)
         if callable(queue):
             queue = queue()
         if not is_testing():
@@ -116,4 +126,5 @@ class task:
         setattr(fn, 'name', cls.name)
         setattr(fn, 'apply_async', cls.apply_async)
         setattr(fn, 'delay', cls.delay)
+        setattr(fn, 'get_async_body', cls.get_async_body)
         return fn
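
Illustration, not part of the diff: splitting get_async_body out of apply_async is what lets the dispatcher enqueue its own scheduled tasks without a pg_notify round trip. A sketch, with the task path here only assumed for demonstration:

    from awx.main.dispatch.worker import TaskWorker

    # resolve a @task-decorated callable and build the message dict that would
    # normally be JSON-serialized into a pg_notify payload
    fn = TaskWorker.resolve_callable('awx.main.tasks.system.cluster_node_heartbeat')  # assumed task path
    body = fn.get_async_body()
    # body: {'uuid': ..., 'args': [], 'kwargs': {}, 'task': ..., 'time_pub': ...}
    # the consumer can hand this straight to its worker pool via dispatch_task(body)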
@@ -11,11 +11,13 @@ import psycopg
 import time
 from uuid import UUID
 from queue import Empty as QueueEmpty
+from datetime import timedelta
 
 from django import db
 from django.conf import settings
 
 from awx.main.dispatch.pool import WorkerPool
+from awx.main.dispatch.periodic import Scheduler
 from awx.main.dispatch import pg_bus_conn
 from awx.main.utils.common import log_excess_runtime
 from awx.main.utils.db import set_connection_name
@@ -64,10 +66,12 @@ class AWXConsumerBase(object):
     def control(self, body):
         logger.warning(f'Received control signal:\n{body}')
         control = body.get('control')
-        if control in ('status', 'running', 'cancel'):
+        if control in ('status', 'schedule', 'running', 'cancel'):
             reply_queue = body['reply_to']
             if control == 'status':
                 msg = '\n'.join([self.listening_on, self.pool.debug()])
+            if control == 'schedule':
+                msg = self.scheduler.debug()
             elif control == 'running':
                 msg = []
                 for worker in self.pool.workers:
@@ -93,16 +97,11 @@ class AWXConsumerBase(object):
         else:
             logger.error('unrecognized control message: {}'.format(control))
 
-    def process_task(self, body):
+    def dispatch_task(self, body):
+        """This will place the given body into a worker queue to run a method decorated as a task"""
         if isinstance(body, dict):
             body['time_ack'] = time.time()
 
-        if 'control' in body:
-            try:
-                return self.control(body)
-            except Exception:
-                logger.exception(f"Exception handling control message: {body}")
-            return
         if len(self.pool):
             if "uuid" in body and body['uuid']:
                 try:
@@ -116,6 +115,16 @@ class AWXConsumerBase(object):
             self.pool.write(queue, body)
             self.total_messages += 1
 
+    def process_task(self, body):
+        """Routes the task details in body as either a control task or a task-task"""
+        if 'control' in body:
+            try:
+                return self.control(body)
+            except Exception:
+                logger.exception(f"Exception handling control message: {body}")
+            return
+        self.dispatch_task(body)
+
     @log_excess_runtime(logger)
     def record_statistics(self):
         if time.time() - self.last_stats > 1:  # buffer stat recording to once per second
@@ -150,7 +159,7 @@ class AWXConsumerRedis(AWXConsumerBase):
 
 
 class AWXConsumerPG(AWXConsumerBase):
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, schedule=None, **kwargs):
         super().__init__(*args, **kwargs)
         self.pg_max_wait = settings.DISPATCHER_DB_DOWNTOWN_TOLLERANCE
         # if no successful loops have ran since startup, then we should fail right away
@@ -161,27 +170,53 @@ class AWXConsumerPG(AWXConsumerBase):
         self.subsystem_metrics = s_metrics.Metrics(auto_pipe_execute=False)
         self.last_metrics_gather = init_time
         self.listen_cumulative_time = 0.0
+        if schedule:
+            schedule = schedule.copy()
+        else:
+            schedule = {}
+        # add control tasks to be run at regular schedules
+        # NOTE: if we run out of database connections, it is important to still run cleanup
+        # so that we scale down workers and free up connections
+        schedule['pool_cleanup'] = {'control': self.pool.cleanup, 'schedule': timedelta(seconds=60)}
+        # record subsystem metrics for the dispatcher
+        schedule['metrics_gather'] = {'control': self.record_metrics, 'schedule': timedelta(seconds=20)}
+        self.scheduler = Scheduler(schedule)
+
+    def record_metrics(self):
+        current_time = time.time()
+        self.pool.produce_subsystem_metrics(self.subsystem_metrics)
+        self.subsystem_metrics.set('dispatcher_availability', self.listen_cumulative_time / (current_time - self.last_metrics_gather))
+        self.subsystem_metrics.pipe_execute()
+        self.listen_cumulative_time = 0.0
+        self.last_metrics_gather = current_time
 
     def run_periodic_tasks(self):
-        self.record_statistics()  # maintains time buffer in method
-
-        current_time = time.time()
-        if current_time - self.last_cleanup > 60:  # same as cluster_node_heartbeat
-            # NOTE: if we run out of database connections, it is important to still run cleanup
-            # so that we scale down workers and free up connections
-            self.pool.cleanup()
-            self.last_cleanup = current_time
-
-        # record subsystem metrics for the dispatcher
-        if current_time - self.last_metrics_gather > 20:
-            try:
-                self.pool.produce_subsystem_metrics(self.subsystem_metrics)
-                self.subsystem_metrics.set('dispatcher_availability', self.listen_cumulative_time / (current_time - self.last_metrics_gather))
-                self.subsystem_metrics.pipe_execute()
-            except Exception:
-                logger.exception(f"encountered an error trying to store {self.name} metrics")
-            self.listen_cumulative_time = 0.0
-            self.last_metrics_gather = current_time
+        """
+        Run general periodic logic, and return the maximum time in seconds before
+        the next requested run.
+        This may be called more often than that when events are consumed,
+        so it should be very efficient.
+        """
+        try:
+            self.record_statistics()  # maintains time buffer in method
+        except Exception as exc:
+            logger.warning(f'Failed to save dispatcher statistics {exc}')
+
+        for job in self.scheduler.get_and_mark_pending():
+            if 'control' in job.data:
+                try:
+                    job.data['control']()
+                except Exception:
+                    logger.exception(f'Error running control task {job.data}')
+            elif 'task' in job.data:
+                body = self.worker.resolve_callable(job.data['task']).get_async_body()
+                # bypasses pg_notify for scheduled tasks
+                self.dispatch_task(body)
+
+        self.pg_is_down = False
+        self.listen_start = time.time()
+
+        return self.scheduler.time_until_next_run()
 
     def run(self, *args, **kwargs):
         super(AWXConsumerPG, self).run(*args, **kwargs)
@@ -197,14 +232,15 @@ class AWXConsumerPG(AWXConsumerBase):
                     if init is False:
                         self.worker.on_start()
                         init = True
                     self.listen_start = time.time()
+                    # run_periodic_tasks runs scheduled actions and gives the time until the next scheduled action
+                    # this is saved to the conn (PubSub) object in order to modify the read timeout in-loop
+                    conn.select_timeout = self.run_periodic_tasks()
+                    # this is the main operational loop for awx-manage run_dispatcher
                     for e in conn.events(yield_timeouts=True):
-                        self.listen_cumulative_time += time.time() - self.listen_start
+                        self.listen_cumulative_time += time.time() - self.listen_start  # for metrics
                         if e is not None:
                             self.process_task(json.loads(e.payload))
-                        self.run_periodic_tasks()
-                        self.pg_is_down = False
-                        self.listen_start = time.time()
+                        conn.select_timeout = self.run_periodic_tasks()
                         if self.should_stop:
                             return
         except psycopg.InterfaceError:
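
For orientation, not part of the diff: the reworked loop is a standard select-with-deadline pattern. A standalone sketch with assumed names (sock, handle_event):

    import select

    def run_loop(sock, scheduler, handle_event):
        # one select() loop drives both message handling and the schedule;
        # the wait is always bounded by the time until the next due job
        timeout = scheduler.time_until_next_run()
        while True:
            ready, _, _ = select.select([sock], [], [], timeout)
            if ready:
                handle_event(sock)  # a pg_notify payload arrived
            for job in scheduler.get_and_mark_pending():
                job.data['control']()  # run whatever came due (control-style job)
            timeout = scheduler.time_until_next_run()  # re-arm before the next wait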
@@ -3,15 +3,13 @@
 import logging
 import yaml
 
-from django.core.cache import cache as django_cache
+from django.conf import settings
 from django.core.management.base import BaseCommand
-from django.db import connection as django_connection
 
 from awx.main.dispatch import get_task_queuename
 from awx.main.dispatch.control import Control
 from awx.main.dispatch.pool import AutoscalePool
 from awx.main.dispatch.worker import AWXConsumerPG, TaskWorker
-from awx.main.dispatch import periodic
 
 logger = logging.getLogger('awx.main.dispatch')
@@ -21,6 +19,7 @@ class Command(BaseCommand):
 
     def add_arguments(self, parser):
         parser.add_argument('--status', dest='status', action='store_true', help='print the internal state of any running dispatchers')
+        parser.add_argument('--schedule', dest='schedule', action='store_true', help='print the current status of schedules being run by dispatcher')
         parser.add_argument('--running', dest='running', action='store_true', help='print the UUIDs of any tasks managed by this dispatcher')
         parser.add_argument(
             '--reload',
@@ -42,6 +41,9 @@ class Command(BaseCommand):
         if options.get('status'):
             print(Control('dispatcher').status())
             return
+        if options.get('schedule'):
+            print(Control('dispatcher').schedule())
+            return
         if options.get('running'):
             print(Control('dispatcher').running())
             return
@@ -58,21 +60,11 @@ class Command(BaseCommand):
             print(Control('dispatcher').cancel(cancel_data))
             return
 
-        # It's important to close these because we're _about_ to fork, and we
-        # don't want the forked processes to inherit the open sockets
-        # for the DB and cache connections (that way lies race conditions)
-        django_connection.close()
-        django_cache.close()
-
-        # spawn a daemon thread to periodically enqueue scheduled tasks
-        # (like the node heartbeat)
-        periodic.run_continuously()
-
         consumer = None
 
         try:
             queues = ['tower_broadcast_all', 'tower_settings_change', get_task_queuename()]
-            consumer = AWXConsumerPG('dispatcher', TaskWorker(), queues, AutoscalePool(min_workers=4))
+            consumer = AWXConsumerPG('dispatcher', TaskWorker(), queues, AutoscalePool(min_workers=4), schedule=settings.CELERYBEAT_SCHEDULE)
             consumer.run()
         except KeyboardInterrupt:
             logger.debug('Terminating Task Dispatcher')
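
Usage note, not part of the diff: once the dispatcher is running, `awx-manage run_dispatcher --schedule` asks it for the Scheduler.debug() output, YAML shaped roughly like (values elided):

    title: Scheduler status
    started_time: ...
    current_time: ...
    current_time_relative: ...
    total_schedules: ...
    schedule_list:
      pool_cleanup:
        last_run_seconds_ago: ...
        next_run_in_seconds: ...
        offset_in_seconds: ...
        completed_runs: ...
        missed_runs: ...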
@@ -3,6 +3,7 @@ import multiprocessing
 import random
 import signal
 import time
+import yaml
 from unittest import mock
 
 from django.utils.timezone import now as tz_now
@@ -13,6 +14,7 @@ from awx.main.dispatch import reaper
 from awx.main.dispatch.pool import StatefulPoolWorker, WorkerPool, AutoscalePool
 from awx.main.dispatch.publish import task
 from awx.main.dispatch.worker import BaseWorker, TaskWorker
+from awx.main.dispatch.periodic import Scheduler
 
 
 '''
@@ -439,3 +441,76 @@ class TestJobReaper(object):
         assert job.started > ref_time
         assert job.status == 'running'
         assert job.job_explanation == ''
+
+
+@pytest.mark.django_db
+class TestScheduler:
+    def test_too_many_schedules_freak_out(self):
+        with pytest.raises(RuntimeError):
+            Scheduler({'job1': {'schedule': datetime.timedelta(seconds=1)}, 'job2': {'schedule': datetime.timedelta(seconds=1)}})
+
+    def test_spread_out(self):
+        scheduler = Scheduler(
+            {
+                'job1': {'schedule': datetime.timedelta(seconds=16)},
+                'job2': {'schedule': datetime.timedelta(seconds=16)},
+                'job3': {'schedule': datetime.timedelta(seconds=16)},
+                'job4': {'schedule': datetime.timedelta(seconds=16)},
+            }
+        )
+        assert [job.offset for job in scheduler.jobs] == [0, 4, 8, 12]
+
+    def test_missed_schedule(self, mocker):
+        scheduler = Scheduler({'job1': {'schedule': datetime.timedelta(seconds=10)}})
+        assert scheduler.jobs[0].missed_runs(time.time() - scheduler.global_start) == 0
+        mocker.patch('awx.main.dispatch.periodic.time.time', return_value=scheduler.global_start + 50)
+        scheduler.get_and_mark_pending()
+        assert scheduler.jobs[0].missed_runs(50) > 1
+
+    def test_advance_schedule(self, mocker):
+        scheduler = Scheduler(
+            {
+                'job1': {'schedule': datetime.timedelta(seconds=30)},
+                'joba': {'schedule': datetime.timedelta(seconds=20)},
+                'jobb': {'schedule': datetime.timedelta(seconds=20)},
+            }
+        )
+        for job in scheduler.jobs:
+            # HACK: the offsets automatically added make this a hard test to write... so remove offsets
+            job.offset = 0.0
+        mocker.patch('awx.main.dispatch.periodic.time.time', return_value=scheduler.global_start + 29)
+        to_run = scheduler.get_and_mark_pending()
+        assert set(job.name for job in to_run) == set(['joba', 'jobb'])
+        mocker.patch('awx.main.dispatch.periodic.time.time', return_value=scheduler.global_start + 39)
+        to_run = scheduler.get_and_mark_pending()
+        assert len(to_run) == 1
+        assert to_run[0].name == 'job1'
+
+    @staticmethod
+    def get_job(scheduler, name):
+        for job in scheduler.jobs:
+            if job.name == name:
+                return job
+
+    def test_scheduler_debug(self, mocker):
+        scheduler = Scheduler(
+            {
+                'joba': {'schedule': datetime.timedelta(seconds=20)},
+                'jobb': {'schedule': datetime.timedelta(seconds=50)},
+                'jobc': {'schedule': datetime.timedelta(seconds=500)},
+                'jobd': {'schedule': datetime.timedelta(seconds=20)},
+            }
+        )
+        rel_time = 119.9  # slightly under the 6th 20-second bin, to avoid offset problems
+        current_time = scheduler.global_start + rel_time
+        mocker.patch('awx.main.dispatch.periodic.time.time', return_value=current_time - 1.0e-8)
+        self.get_job(scheduler, 'jobb').mark_run(rel_time)
+        self.get_job(scheduler, 'jobd').mark_run(rel_time - 20.0)
+
+        output = scheduler.debug()
+        data = yaml.safe_load(output)
+        assert data['schedule_list']['jobc']['last_run_seconds_ago'] is None
+        assert data['schedule_list']['joba']['missed_runs'] == 4
+        assert data['schedule_list']['jobd']['missed_runs'] == 3
+        assert data['schedule_list']['jobd']['completed_runs'] == 1
+        assert data['schedule_list']['jobb']['next_run_in_seconds'] > 25.0
@@ -1,21 +0,0 @@
-The MIT License (MIT)
-
-Copyright (c) 2013 Daniel Bader (http://dbader.org)
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
@@ -46,7 +46,6 @@ python-tss-sdk==1.0.0
 python-ldap
 pyyaml>=6.0.1
 receptorctl==1.3.0
-schedule==0.6.0
 social-auth-core[openidconnect]==4.3.0  # see UPGRADE BLOCKERs
 social-auth-app-django==5.0.0  # see UPGRADE BLOCKERs
 sqlparse >= 0.4.4  # Required by django https://github.com/ansible/awx/security/dependabot/96
@@ -380,8 +380,6 @@ rsa==4.9
     # python-jose
 s3transfer==0.6.0
     # via boto3
-schedule==0.6.0
-    # via -r /awx_devel/requirements/requirements.in
 semantic-version==2.10.0
     # via setuptools-rust
 service-identity==21.1.0