remove infinite loop regex

* The fancy URL-finding regex can result in an infinite loop for malformed IPv6 URLs, so replace it with a more naive regex that can overmatch. * Regexes that match malformed IPv6 URLs will be passed to urlparse. This can result in a parsing error (ValueError). For these cases we redact the entire found URI.
2026-04-06 02:29:21 -02:30 · 2018-04-13 15:13:20 -04:00
parent b1f4fb3a98
commit 04693ecb0f
2 changed files with 83 additions and 61 deletions
--- a/awx/main/redact.py
+++ b/awx/main/redact.py
@@ -6,8 +6,7 @@ REPLACE_STR = '$encrypted$'

 class UriCleaner(object):
    REPLACE_STR = REPLACE_STR
-    # https://regex101.com/r/sV2dO2/2
-    SENSITIVE_URI_PATTERN = re.compile(ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))', re.MULTILINE)  # NOQA
+    SENSITIVE_URI_PATTERN = re.compile(ur'(\w+:(\/?\/?)[^\s]+)', re.MULTILINE)  # NOQA

    @staticmethod
    def remove_sensitive(cleartext):
@@ -17,38 +16,46 @@ class UriCleaner(object):
            match = UriCleaner.SENSITIVE_URI_PATTERN.search(redactedtext, text_index)
            if not match:
                break
-            o = urlparse.urlsplit(match.group(1))
-            if not o.username and not o.password:
-                if o.netloc and ":" in o.netloc:
-                    # Handle the special case url http://username:password that can appear in SCM url
-                    # on account of a bug? in ansible redaction
-                    (username, password) = o.netloc.split(':')
+            try:
+                uri_str = match.group(1)
+                # May raise a ValueError if invalid URI for one reason or another
+                o = urlparse.urlsplit(uri_str)
+
+                if not o.username and not o.password:
+                    if o.netloc and ":" in o.netloc:
+                        # Handle the special case url http://username:password that can appear in SCM url
+                        # on account of a bug? in ansible redaction
+                        (username, password) = o.netloc.split(':')
+                    else:
+                        text_index += len(match.group(1))
+                        continue
                else:
-                    text_index += len(match.group(1))
-                    continue
-            else:
-                username = o.username
-                password = o.password
+                    username = o.username
+                    password = o.password

-            # Given a python MatchObject, with respect to redactedtext, find and
-            # replace the first occurance of username and the first and second
-            # occurance of password
+                # Given a python MatchObject, with respect to redactedtext, find and
+                # replace the first occurance of username and the first and second
+                # occurance of password

-            uri_str = redactedtext[match.start():match.end()]
-            if username:
-                uri_str = uri_str.replace(username, UriCleaner.REPLACE_STR, 1)
-            # 2, just in case the password is $encrypted$
-            if password:
-                uri_str = uri_str.replace(password, UriCleaner.REPLACE_STR, 2)
+                uri_str = redactedtext[match.start():match.end()]
+                if username:
+                    uri_str = uri_str.replace(username, UriCleaner.REPLACE_STR, 1)
+                # 2, just in case the password is $encrypted$
+                if password:
+                    uri_str = uri_str.replace(password, UriCleaner.REPLACE_STR, 2)

-            t = redactedtext[:match.start()] + uri_str
-            text_index = len(t)
-            if (match.end() < len(redactedtext)):
-                t += redactedtext[match.end():]
+                t = redactedtext[:match.start()] + uri_str
+                text_index = len(t)
+                if (match.end() < len(redactedtext)):
+                    t += redactedtext[match.end():]

-            redactedtext = t
-            if text_index >= len(redactedtext):
-                text_index = len(redactedtext) - 1
+                redactedtext = t
+                if text_index >= len(redactedtext):
+                    text_index = len(redactedtext) - 1
+            except ValueError:
+                # Invalid URI, redact the whole URI to be safe
+                redactedtext = redactedtext[:match.start()] + UriCleaner.REPLACE_STR + redactedtext[match.end():]
+                text_index = match.start() + len(UriCleaner.REPLACE_STR)

        return redactedtext