Add customizable batch_size for cleanup_activitystream and cleanup_jobs (#14412)

Signed-off-by: Daniel Gonçalves <daniel.gonc@lves.fr>
This commit is contained in:
Daniel Gonçalves 2023-10-11 22:09:16 +02:00 committed by GitHub
parent 19ca480078
commit 56878b4910
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 13 additions and 5 deletions

View File

@ -24,6 +24,9 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--days', dest='days', type=int, default=90, metavar='N', help='Remove activity stream events more than N days old')
parser.add_argument('--dry-run', dest='dry_run', action='store_true', default=False, help='Dry run mode (show items that would be removed)')
parser.add_argument(
'--batch-size', dest='batch_size', type=int, default=500, metavar='X', help='Remove activity stream events in batch of X events. Defaults to 500.'
)
def init_logging(self):
log_levels = dict(enumerate([logging.ERROR, logging.INFO, logging.DEBUG, 0]))
@ -48,7 +51,7 @@ class Command(BaseCommand):
else:
pks_to_delete.add(asobj.pk)
# Cleanup objects in batches instead of deleting each one individually.
if len(pks_to_delete) >= 500:
if len(pks_to_delete) >= self.batch_size:
ActivityStream.objects.filter(pk__in=pks_to_delete).delete()
n_deleted_items += len(pks_to_delete)
pks_to_delete.clear()
@ -63,4 +66,5 @@ class Command(BaseCommand):
self.days = int(options.get('days', 30))
self.cutoff = now() - datetime.timedelta(days=self.days)
self.dry_run = bool(options.get('dry_run', False))
self.batch_size = int(options.get('batch_size', 500))
self.cleanup_activitystream()

View File

@ -150,6 +150,9 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--days', dest='days', type=int, default=90, metavar='N', help='Remove jobs/updates executed more than N days ago. Defaults to 90.')
parser.add_argument('--dry-run', dest='dry_run', action='store_true', default=False, help='Dry run mode (show items that would be removed)')
parser.add_argument(
'--batch-size', dest='batch_size', type=int, default=100000, metavar='X', help='Remove jobs in batch of X jobs. Defaults to 100000.'
)
parser.add_argument('--jobs', dest='only_jobs', action='store_true', default=False, help='Remove jobs')
parser.add_argument('--ad-hoc-commands', dest='only_ad_hoc_commands', action='store_true', default=False, help='Remove ad hoc commands')
parser.add_argument('--project-updates', dest='only_project_updates', action='store_true', default=False, help='Remove project updates')
@ -226,8 +229,6 @@ class Command(BaseCommand):
cursor.execute(f'DROP TABLE _unpartitioned_{tblname}')
def cleanup_jobs(self):
batch_size = 100000
# Hack to avoid doing N+1 queries as each item in the Job query set does
# an individual query to get the underlying UnifiedJob.
Job.polymorphic_super_sub_accessors_replaced = True
@ -242,8 +243,8 @@ class Command(BaseCommand):
deleted = 0
info = qs.aggregate(min=Min('id'), max=Max('id'))
if info['min'] is not None:
for start in range(info['min'], info['max'] + 1, batch_size):
qs_batch = qs.filter(id__gte=start, id__lte=start + batch_size)
for start in range(info['min'], info['max'] + 1, self.batch_size):
qs_batch = qs.filter(id__gte=start, id__lte=start + self.batch_size)
pk_list = qs_batch.values_list('id', flat=True)
_, results = qs_batch.delete()
@ -402,6 +403,7 @@ class Command(BaseCommand):
self.init_logging()
self.days = int(options.get('days', 90))
self.dry_run = bool(options.get('dry_run', False))
self.batch_size = int(options.get('batch_size', 100000))
try:
self.cutoff = now() - datetime.timedelta(days=self.days)
except OverflowError:

View File

@ -1873,6 +1873,8 @@ class RunSystemJob(BaseTask):
if system_job.job_type in ('cleanup_jobs', 'cleanup_activitystream'):
if 'days' in json_vars:
args.extend(['--days', str(json_vars.get('days', 60))])
if 'batch_size' in json_vars:
args.extend(['--batch-size', str(json_vars['batch_size'])])
if 'dry_run' in json_vars and json_vars['dry_run']:
args.extend(['--dry-run'])
if system_job.job_type == 'cleanup_jobs':