Remove pandas, use csv. Also, remove anonymization

This commit is contained in:
Aparna Karve
2023-02-23 12:42:01 -08:00
committed by John Westcott IV
parent 311cea5a4a
commit 132fe5e443

View File

@@ -3,14 +3,11 @@ import datetime
from django.core.serializers.json import DjangoJSONEncoder from django.core.serializers.json import DjangoJSONEncoder
from awx.main.models.inventory import HostMetric, HostMetricSummaryMonthly from awx.main.models.inventory import HostMetric, HostMetricSummaryMonthly
from awx.main.analytics.collectors import config from awx.main.analytics.collectors import config
from awx.main.utils.encryption import get_encryption_key, Fernet256
from django.utils.encoding import smart_str, smart_bytes
import base64
import json import json
import sys import sys
import tempfile import tempfile
import tarfile import tarfile
import pandas as pd import csv
PREFERRED_ROW_COUNT = 500000 PREFERRED_ROW_COUNT = 500000
@@ -52,7 +49,7 @@ class Command(BaseCommand):
return list_of_queryset return list_of_queryset
def paginated_df(self, options, type, filter_kwargs, offset=0, limit=PREFERRED_ROW_COUNT): def paginated_db_retrieval(self, type, filter_kwargs, offset=0, limit=PREFERRED_ROW_COUNT):
list_of_queryset = [] list_of_queryset = []
if type == 'host_metric': if type == 'host_metric':
result = HostMetric.objects.filter(**filter_kwargs) result = HostMetric.objects.filter(**filter_kwargs)
@@ -61,16 +58,7 @@ class Command(BaseCommand):
result = HostMetricSummaryMonthly.objects.filter(**filter_kwargs) result = HostMetricSummaryMonthly.objects.filter(**filter_kwargs)
list_of_queryset = self.host_metric_summary_monthly_queryset(result, offset, limit) list_of_queryset = self.host_metric_summary_monthly_queryset(result, offset, limit)
df = pd.DataFrame(list_of_queryset) return list_of_queryset
if options['anonymized'] and 'hostname' in df.columns:
key = get_encryption_key('hostname', options.get('anonymized'))
df['hostname'] = df.apply(lambda x: self.obfuscated_hostname(key, x['hostname']), axis=1)
return df
def obfuscated_hostname(self, secret_sauce, hostname):
return self.encrypt_name(secret_sauce, hostname)
def whole_page_count(self, row_count, rows_per_file): def whole_page_count(self, row_count, rows_per_file):
whole_pages = int(row_count / rows_per_file) whole_pages = int(row_count / rows_per_file)
@@ -80,10 +68,16 @@ class Command(BaseCommand):
return whole_pages return whole_pages
def csv_for_tar(self, options, temp_dir, type, filter_kwargs, index=1, offset=0, rows_per_file=PREFERRED_ROW_COUNT): def csv_for_tar(self, options, temp_dir, type, filter_kwargs, index=1, offset=0, rows_per_file=PREFERRED_ROW_COUNT):
df = self.paginated_df(options, type, filter_kwargs, offset, rows_per_file) list_of_queryset = self.paginated_db_retrieval(type, filter_kwargs, offset, rows_per_file)
csv_file = f'{temp_dir}/{type}{index}.csv' csv_file = f'{temp_dir}/{type}{index}.csv'
arcname_file = f'{type}{index}.csv' arcname_file = f'{type}{index}.csv'
df.to_csv(csv_file, index=False)
with open(csv_file, 'w', newline='') as output_file:
keys = list_of_queryset[0].keys() if list_of_queryset else []
dict_writer = csv.DictWriter(output_file, keys)
dict_writer.writeheader()
dict_writer.writerows(list_of_queryset)
return csv_file, arcname_file return csv_file, arcname_file
def config_for_tar(self, options, temp_dir): def config_for_tar(self, options, temp_dir):
@@ -94,28 +88,6 @@ class Command(BaseCommand):
f.write(config_json) f.write(config_json)
return config_file, arcname_file return config_file, arcname_file
def encrypt_name(self, key, value):
value = smart_str(value)
f = Fernet256(key)
encrypted = f.encrypt(smart_bytes(value))
b64data = smart_str(base64.b64encode(encrypted))
tokens = ['$encrypted', 'UTF8', 'AESCBC', b64data]
return '$'.join(tokens)
def decrypt_name(self, encryption_key, value):
raw_data = value[len('$encrypted$') :]
# If the encrypted string contains a UTF8 marker, discard it
utf8 = raw_data.startswith('UTF8$')
if utf8:
raw_data = raw_data[len('UTF8$') :]
algo, b64data = raw_data.split('$', 1)
if algo != 'AESCBC':
raise ValueError('unsupported algorithm: %s' % algo)
encrypted = base64.b64decode(b64data)
f = Fernet256(encryption_key)
value = f.decrypt(encrypted)
return smart_str(value)
def output_json(self, options, filter_kwargs): def output_json(self, options, filter_kwargs):
if not options.get('json') or options.get('json') == 'host_metric': if not options.get('json') or options.get('json') == 'host_metric':
result = HostMetric.objects.filter(**filter_kwargs) result = HostMetric.objects.filter(**filter_kwargs)
@@ -184,7 +156,6 @@ class Command(BaseCommand):
parser.add_argument('--json', type=str, const='host_metric', nargs='?', help='Select output as JSON for host_metric or host_metric_summary_monthly') parser.add_argument('--json', type=str, const='host_metric', nargs='?', help='Select output as JSON for host_metric or host_metric_summary_monthly')
parser.add_argument('--csv', type=str, const='host_metric', nargs='?', help='Select output as CSV for host_metric or host_metric_summary_monthly') parser.add_argument('--csv', type=str, const='host_metric', nargs='?', help='Select output as CSV for host_metric or host_metric_summary_monthly')
parser.add_argument('--tarball', action='store_true', help=f'Package CSV files into a tar with upto {PREFERRED_ROW_COUNT} rows') parser.add_argument('--tarball', action='store_true', help=f'Package CSV files into a tar with upto {PREFERRED_ROW_COUNT} rows')
parser.add_argument('--anonymized', type=str, help='Anonymize hostnames with provided salt')
parser.add_argument('--rows_per_file', type=int, help=f'Split rows in chunks of {PREFERRED_ROW_COUNT}') parser.add_argument('--rows_per_file', type=int, help=f'Split rows in chunks of {PREFERRED_ROW_COUNT}')
def handle(self, *args, **options): def handle(self, *args, **options):