optimize the SCM URL sanitizer regex

`\w+:` forces the regex engine to scan an unbounded run of word characters at every position, which is slow on large strings that don't contain URLs; bounding it to `\w{1,20}` caps the per-position work
This commit is contained in:
Ryan Petrello 2020-03-10 23:36:13 -04:00
parent 208dbc1f92
commit c95624e27f
No known key found for this signature in database
GPG Key ID: F2AA5F2122351777
2 changed files with 8 additions and 1 deletions

View File

@@ -8,7 +8,7 @@ REPLACE_STR = '$encrypted$'
class UriCleaner(object):
REPLACE_STR = REPLACE_STR
SENSITIVE_URI_PATTERN = re.compile(r'(\w+:(\/?\/?)[^\s]+)', re.MULTILINE) # NOQA
SENSITIVE_URI_PATTERN = re.compile(r'(\w{1,20}:(\/?\/?)[^\s]+)', re.MULTILINE) # NOQA
@staticmethod
def remove_sensitive(cleartext):

View File

@@ -152,3 +152,10 @@ def test_uri_scm_cleartext_redact_and_replace(test_data):
# Ensure the host didn't get redacted
assert redacted_str.count(uri.host) == test_data['host_occurrences']
@pytest.mark.timeout(1)
def test_large_string_performance():
    """Regression guard: scanning a URL-free 100k-char string must finish
    within the 1s timeout, and the input comes back the same length
    (there is nothing to redact)."""
    size = 100000
    cleartext = 'x' * size
    result = UriCleaner.remove_sensitive(cleartext)
    assert len(result) == size