From 919975792d814d5144b9174ae70743dc27490169 Mon Sep 17 00:00:00 2001
From: Branko Majic <branko@majic.rs>
Date: Wed, 21 Feb 2018 16:34:05 +0100
Subject: [PATCH] Fix internal urlize markdown extension:

- Added comprehensive tests for the extension.
- Rewrote the extension to have better behaviour.
- Fixed issues where URL path/GET parameters would end up not being
  part of the link.
- Make the extension more flexible in terms of matching FQDNs that do
  not start with www.
- Added inline documentation to make it clearer what is expected from
  the extension.
---
 src/wiki/plugins/links/mdx/urlize.py | 170 +++++++++++++++-----------
 tests/plugins/links/test_urlize.py   | 174 +++++++++++++++++++++++++++
 2 files changed, 276 insertions(+), 68 deletions(-)
 create mode 100644 tests/plugins/links/test_urlize.py
diff --git a/src/wiki/plugins/links/mdx/urlize.py b/src/wiki/plugins/links/mdx/urlize.py
index d02fc4d7..3dc630c6 100644
--- a/src/wiki/plugins/links/mdx/urlize.py
+++ b/src/wiki/plugins/links/mdx/urlize.py
@@ -2,98 +2,137 @@ import re
 
 import markdown
 
-"""
-Code modified from:
-https://github.com/r0wb0t/markdown-urlize
-
-A more liberal autolinker
-
-Inspired by Django's urlize function.
-
-Positive examples:
-
->>> import markdown
->>> md = markdown.Markdown(extensions=['urlize'])
-
->>> md.convert('http://example.com/')
-'<p><a href="http://example.com/">http://example.com/</a></p>'
-
->>> md.convert('go to http://example.com')
-'<p>go to <a href="http://example.com">http://example.com</a></p>'
-
->>> md.convert('example.com')
-'<p><a href="http://example.com">example.com</a></p>'
-
->>> md.convert('example.net')
-'<p><a href="http://example.net">example.net</a></p>'
-
->>> md.convert('www.example.us')
-'<p><a href="http://www.example.us">www.example.us</a></p>'
-
->>> md.convert('(www.example.us/path/?name=val)')
-'<p>(<a href="http://www.example.us/path/?name=val">www.example.us/path/?name=val</a>)</p>'
+# Regular expression is meant to match the following pattern:
+#
+# [BEGIN_URL][PROTOCOL]HOST[:PORT][/[PATH]][END_URL]
+#
+# Everything except HOST is meant to be optional, as denoted by square
+# brackets.
+#
+# Pattern elements are as follows:
+#
+# BEGIN_URL
+#   Either '<' or '('.
+#
+# PROTOCOL
+#   One of: 'http://', 'https://', 'ftp://', or 'ftps://'.
+#
+# HOST
+#   Host can be one of: IPv4 address, IPv6 address in full form, IPv6
+#   address in shortened form (e.g. ::1 vs 0:....:0:1 or any
+#   combination of), FQDN-like entry (dot-separated domain
+#   components), or string 'localhost'.
+#
+# PORT
+#   Port should be a numeric value. Keep in mind that it must be
+#   preceded with the colon (':').
+#
+# PATH
+#   Additional PATH, including any GET parameters that should be part
+#   of the URL.
+#
+# END_URL
+#   Either '>' or ')'. Should match with same type as BEGIN_URL.
+#
+# It should be noted that there are some inconsistencies with the below
+# regex, mainly that:
+#
+# - No IPv4 or IPv6 address validation is performed.
+# - Excessively long IPv6 addresses will end up being matched if the
+#   shortened form happens somewhere in the middle of host string.
+#
+# In order to make the regex easier to handle later on, the following
+# named groups are provided:
+#
+# - begin (character denoting beginning of URL)
+# - url (entire URL that can be used, for example, as actual link for
+#   href).
+# - protocol (protocol, together with the trailing ://)
+# - host (just the host part)
+# - port (just the port number)
+# - path (path, combined with any additional GET parameters)
+# - end (character denoting end of URL)
+#
+URLIZE_RE = (
+    r'(?P<begin>[\(\<])?(?P<url>'  # begin url group
 
->>> md.convert('go to <http://example.com> now!')
-'<p>go to <a href="http://example.com">http://example.com</a> now!</p>'
+    # Leading protocol specification (empty alternative keeps it optional).
+    r'(?P<protocol>http://|https://|ftp://|ftps://|)'
 
-Negative examples:
+    # Host identifier
+    r'(?P<host>'  # begin host identifier group
 
->>> md.convert('del.icio.us')
-'<p>del.icio.us</p>'
+    r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|'  # IPv4, match before FQDN
+    r'\[?[A-F0-9]{1,4}:([A-F0-9]{1,4}:){6}[A-F0-9]{1,4}\]?|'  # IPv6, full form
+    r'\[?:(:[A-F0-9]{1,4}){1,6}\]?|'  # IPv6, leading zeros removed
+    r'\[?([A-F0-9]{1,4}:){1,6}(:[A-F0-9]{1,4}){1,6}\]?|'  # IPv6, zeros in middle removed; tail groups are colon-separated
+    r'\[?([A-F0-9]{1,4}:){1,6}:\]?|'  # IPv6, trailing zeros removed
+    r'\[?::\]?|'  # IPv6, just "empty" address
+    r'([A-Z0-9]([A-Z0-9-]{0,61}[A-Z0-9])?\.)+([A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # FQDN
+    r'localhost'  # localhost
+    r')'  # end host identifier group
 
-"""
+    # Optional port
+    r'(:(?P<port>[0-9]+))?'
 
+    # Optional trailing slash with path and GET parameters.
+    r'(/(?P<path>[^\s\[\(\]\)\<\>]*))?'
 
-# Taken from Django trunk 2f121dfe635b3f497fe1fe03bc8eb97cdf5083b3
-# https://github.com/django/django/blob/master/django/core/validators.py#L47
-URLIZE_RE = (
-    r'((?:(?:http|ftp)s?://|www\.)'  # http:// or https://
-    # domain...
-    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
-    r'localhost|'  # localhost...
-    r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|'  # ...or ipv4
-    r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
-    r'(?::[0-9]+)?'  # optional port
-    r'(?:/[^\s\[\(\]\)]*(?:\s+|$))?)'
+    r')(?P<end>[\)\>])?'  # end url group
 )
 
 
 class UrlizePattern(markdown.inlinepatterns.Pattern):
 
-    def __init__(self, pattern, markdown_instance=None):
-        markdown.inlinepatterns.Pattern.__init__(
-            self,
-            pattern,
-            markdown_instance=markdown_instance)
-        self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,
-                                      re.DOTALL | re.UNICODE | re.IGNORECASE)
+    def getCompiledRegExp(self):
+        """
+        Return compiled regular expression for matching the URL
+        patterns. We introduce case-insensitive matching in addition
+        to standard matching flags added by parent class.
+        """
 
-    """ Return a link Element given an autolink (`http://example/com`). """
+        return re.compile(r'^(.*?)%s(.*?)$' % URLIZE_RE, re.DOTALL | re.UNICODE | re.IGNORECASE)
 
     def handleMatch(self, m):
-        url = m.group(2)
+        """
+        Return the link element for a matched URL, or the matched text
+        unchanged when its enclosing '<'/'(' and '>'/')' do not pair up.
+        """
+        begin_url, end_url = m.group('begin'), m.group('end')
 
-        if url.startswith('<'):
-            url = url[1:-1]
+        protocol = m.group('protocol')
 
+        url = m.group('url')
         text = url
 
-        if not url.split('://')[0] in ('http', 'https', 'ftp'):
-            if '@' in url and '/' not in url:
-                url = 'mailto:' + url
-            else:
-                url = 'http://' + url
+        # Only the matched span; group 0 would also include the surrounding text.
+        matched_string = (begin_url or '') + url + (end_url or '')
+
+        # If opening and ending character for URL are not the same,
+        # return text unchanged.
+        if (begin_url == '<' and end_url != '>') or (begin_url == '(' and end_url != ')'):
+            return matched_string
+
+        # If no protocol was matched, assume plaintext http. Matching is
+        # case-insensitive, so test for emptiness, not exact spelling.
+        if not protocol:
+            url = 'http://' + url
 
+        # Convenience icon to distinguish external links more easily.
         icon = markdown.util.etree.Element("span")
         icon.set('class', 'fa fa-external-link')
 
+        # Link text.
         span_text = markdown.util.etree.Element("span")
         span_text.text = markdown.util.AtomicString(" " + text)
+
+        # Set-up link itself.
         el = markdown.util.etree.Element("a")
         el.set('href', url)
         el.set('target', '_blank')
         el.append(icon)
         el.append(span_text)
+
         return el
 
 
@@ -110,8 +149,3 @@ def makeExtension(configs=None):
     if configs is None:
         configs = {}
     return UrlizeExtension(configs=configs)
-
-
-if __name__ == "__main__":
-    import doctest
-    doctest.testmod()
diff --git a/tests/plugins/links/test_urlize.py b/tests/plugins/links/test_urlize.py
new file mode 100644
index 00000000..079c123a
--- /dev/null
+++ b/tests/plugins/links/test_urlize.py
@@ -0,0 +1,174 @@
+import html
+import markdown
+from unittest import mock
+
+import pytest
+
+from wiki.plugins.links.mdx.urlize import makeExtension, UrlizeExtension
+
+
+# Template accepts two strings - href value and link text value.
+EXPECTED_LINK_TEMPLATE = (
+    '<a href="%s" target="_blank">'
+    '<span class="fa fa-external-link">'
+    '</span>'
+    '<span>'
+    ' %s'
+    '</span>'
+    '</a>'
+)
+
+# Template accepts two strings - href value and link text value.
+EXPECTED_PARAGRAPH_TEMPLATE = '<p>%s</p>' % EXPECTED_LINK_TEMPLATE
+
+
+FIXTURE_POSITIVE_MATCHES = [
+    # Test surrounding begin/end characters.
+    (
+        '(example.com)',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com')
+    ),
+    (
+        '<example.com>',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com')
+    ),
+
+    # Test protocol specification.
+    (
+        'http://example.com',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'http://example.com')
+    ),
+    (
+        'https://example.com',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('https://example.com', 'https://example.com')
+    ),
+    (
+        'ftp://example.com',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('ftp://example.com', 'ftp://example.com')
+    ),
+    (
+        'ftps://example.com',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('ftps://example.com', 'ftps://example.com')
+    ),
+    (
+        'example.com',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com')
+    ),
+
+    # Test various supported host variations.
+    (
+        '10.10.1.1',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://10.10.1.1', '10.10.1.1')
+    ),
+    (
+        '1122:3344:5566:7788:9900:aabb:ccdd:eeff',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://1122:3344:5566:7788:9900:aabb:ccdd:eeff', '1122:3344:5566:7788:9900:aabb:ccdd:eeff')
+    ),
+    (
+        '1122:3344:5566:7788:9900:AaBb:cCdD:EeFf',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://1122:3344:5566:7788:9900:AaBb:cCdD:EeFf', '1122:3344:5566:7788:9900:AaBb:cCdD:EeFf')
+    ),
+    (
+        '::1',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://::1', '::1')
+    ),
+    (
+        '1::2:3',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://1::2:3', '1::2:3')
+    ),
+    (
+        '1::',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://1::', '1::')
+    ),
+    (
+        '::',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://::', '::')
+    ),
+    (
+        'example.com',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com')
+    ),
+    (
+        'my.long.domain.example.com',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://my.long.domain.example.com', 'my.long.domain.example.com')
+    ),
+    (
+        'localhost',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://localhost', 'localhost')
+    ),
+
+    # Test port section.
+    (
+        'localhost:8000',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://localhost:8000', 'localhost:8000')
+    ),
+    (
+        '10.1.1.1:8000',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://10.1.1.1:8000', '10.1.1.1:8000')
+    ),
+
+    # Test trailing path specification.
+    (
+        'http://example.com/',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com/', 'http://example.com/')
+    ),
+    (
+        'http://example.com/my/path',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com/my/path', 'http://example.com/my/path')
+    ),
+    (
+        'http://example.com/my/path?param1=value1&param2=value2',
+        EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com/my/path?param1=value1&amp;param2=value2', 'http://example.com/my/path?param1=value1&amp;param2=value2')
+    ),
+]
+
+
+FIXTURE_NEGATIVE_MATCHES = [
+    # Incomplete FQDNs.
+    (
+        'example.',
+        '<p>example.</p>'
+    ),
+    (
+        '.example .com',
+        '<p>.example .com</p>'
+    ),
+
+    # Invalid FQDNs.
+    (
+        'example-.com',
+        '<p>example-.com</p>'
+    ),
+]
+
+
+class TestUrlizeExtension:
+    """Convert Markdown text with the urlize extension enabled and check the resulting HTML."""
+    def setup_method(self):
+        self.md = markdown.Markdown(extensions=[UrlizeExtension()])
+
+    @pytest.mark.parametrize("markdown_text, expected_output", FIXTURE_POSITIVE_MATCHES)
+    def test_positive_matches(self, markdown_text, expected_output):
+        assert self.md.convert(markdown_text) == expected_output
+
+    @pytest.mark.parametrize("markdown_text, expected_output", FIXTURE_NEGATIVE_MATCHES)
+    def test_negative_matches(self, markdown_text, expected_output):
+        assert self.md.convert(markdown_text) == expected_output
+
+    def test_url_with_non_matching_begin_and_end_ignored(self):
+        assert self.md.convert('(example.com>') == "<p>%s</p>" % html.escape('(example.com>')
+        assert self.md.convert('<example.com)') == "<p>%s</p>" % html.escape('<example.com)')
+
+
+def test_makeExtension_return_value():
+    extension = makeExtension()
+
+    assert isinstance(extension, UrlizeExtension)
+
+
+@mock.patch('wiki.plugins.links.mdx.urlize.UrlizeExtension')
+def test_makeExtension_initialises_using_passed_in_configuration(mock_UrlizeExtension):
+    my_config = mock.Mock()
+    makeExtension(my_config)
+
+    mock_UrlizeExtension.assert_called_once_with(configs=my_config)
-- 
2.45.2