~netlandish/django-wiki

8cff689b42e4412243c0ec819d15b287c3417c60 — Branko Majic 6 years ago 52ee516
Extend custom urlize markdown extension to support any protocol specifier and fix handling of opening/closing brackets:

- Updated handling of opening/closing brackets ('()' and '<>') to be
  more robust, covering some corner cases, and to leave the brackets
  intact in resulting generated HTML (they should still surround the
  URL element, and not be removed).
- Added support for any valid protocol specifier, according to RFC
  3986.
- Moved entire regex defintion into the URLIZE_RE regex variable.
- Updated existing test cases to match new implementation.
- Added new test cases to cover new matching functionality.
2 files changed, 77 insertions(+), 19 deletions(-)

M src/wiki/plugins/links/mdx/urlize.py
M tests/plugins/links/test_urlize.py
M src/wiki/plugins/links/mdx/urlize.py => src/wiki/plugins/links/mdx/urlize.py +42 -17
@@ 4,18 4,20 @@ import markdown

# Regular expression is meant to match the following pattern:
#
# [BEGIN_URL][PROTOCOL]HOST[:PORT][/[PATH]][END_URL]
# [BEGIN][PROTOCOL]HOST[:PORT][/[PATH]][END]
#
# Everything except HOST is meant to be optional, as denoted by square
# brackets.
#
# Patter elements are as follows:
#
# BEGIN_URL
#   Either '<' or '('.
# BEGIN
#   String preceding the link. Can be empty, or any string that ends
#   in whitespace, '(', or '<'.
#
# PROTOCOL
#   One of: 'http://', 'https://', 'ftp://', or 'ftps://'.
#   Syntax defined in https://tools.ietf.org/html/rfc3986 - for
#   example: 'http://', 'https://', 'ftp://', or 'ftps://'.
#
# HOST
#   Host can be one of: IPv4 address, IPv6 address in full form, IPv6


@@ 31,8 33,10 @@ import markdown
#   Additional PATH, including any GET parameters that should be part
#   of the URL.
#
# END_URL
#   Either '>' or ')'. Should match with same type as BEGIN_URL.
# END
#   String following the link. Can be empty, or any string that ends
#   in whitespace, ')', or '>'. If ')', then must match with '(' in
#   BEGIN. If '>', then must match with '<' in BEGIN.
#
# It should be noted that there are some inconsitencies with the below
# regex, mainly that:


@@ 44,20 48,26 @@ import markdown
# In order to make the regex easier to handle later on, the following
# named groups are provided:
#
# - begin (character denoting beginning of URL)
# - begin (string coming before the link, including whitespace or
#   brackets).
# - url (entire URL that can be used, for example, as actual link for
#   href).
# - protocol (protocol, together with the trailing ://)
# - host (just the host part)
# - port (just the port number)
# - path (path, combined with any additional GET parameters)
# - end (character denoting end of URL)
# - end (string coming after the link, including whitespace or
#   brackets)
#
URLIZE_RE = (
    r'(?P<begin>[\(\<])?(?P<url>'  # begin url group
    # Links must start at beginning of string, or be preceded with
    # whitespace, '(', or '<'.
    r'^(?P<begin>|.*?[\s\(\<])'

    r'(?P<url>'  # begin url group

    # Leading protocol specification.
    r'(?P<protocol>http://|https://|ftp://|ftps://|)'
    r'(?P<protocol>([A-Z][A-Z0-9+.-]*://|))'

    # Host identifier
    r'(?P<host>'  # begin host identifier group


@@ 78,7 88,10 @@ URLIZE_RE = (
    # Optional trailing slash with path and GET parameters.
    r'(/(?P<path>[^\s\[\(\]\)\<\>]*))?'

    r')(?P<end>[\)\>])?'  # end url group
    r')'  # end url group

    # Links must stop at end of string, or be followed by a whitespace, ')', or '>'.
    r'(?P<end>[\s\)\>].*?|)$'
)




@@ 92,15 105,13 @@ class UrlizePattern(markdown.inlinepatterns.Pattern):
        """

        # Ensure links are matched only if they stand on their own to avoid bad matches etc.
        return re.compile(r'^(|.*?\s)%s(\s.*?|)$' % URLIZE_RE, re.DOTALL | re.UNICODE | re.IGNORECASE)
        return re.compile(URLIZE_RE, re.DOTALL | re.UNICODE | re.IGNORECASE)

    def handleMatch(self, m):
        """
        Processes match found within the text.
        """

        matched_string = m.group(0)

        protocol = m.group('protocol')

        url = m.group('url')


@@ 111,12 122,26 @@ class UrlizePattern(markdown.inlinepatterns.Pattern):

        # If opening and ending character for URL are not the same,
        # return text unchanged.
        if begin_url == '<' and end_url != '>' or begin_url == '(' and end_url != ')':
            return matched_string
        if begin_url:
            begin_delimeter = begin_url[-1]
        else:
            begin_delimeter = ''
        if end_url:
            end_delimeter = end_url[0]
        else:
            end_delimeter = ''

        if (
                begin_delimeter == '<' and end_delimeter != '>' or
                begin_delimeter == '(' and end_delimeter != ')' or
                end_delimeter == ')' and begin_delimeter != '(' or
                end_delimeter == '>' and begin_delimeter != '<'
        ):
            return url

        # If no supported protocol is specified, assume plaintext http
        # and add it to the url.
        if protocol not in ('http://', 'https://', 'ftp://', 'ftps://'):
        if protocol == '':
            url = 'http://' + url

        # Convenience link to distinguish external links more easily.

M tests/plugins/links/test_urlize.py => tests/plugins/links/test_urlize.py +35 -2
@@ 26,11 26,11 @@ FIXTURE_POSITIVE_MATCHES = [
    # Test surrounding begin/end characters.
    (
        '(example.com)',
        EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com')
        '<p>(' + EXPECTED_LINK_TEMPLATE % ('http://example.com', 'example.com') + ')</p>'
    ),
    (
        '<example.com>',
        EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com')
        '<p>&lt;' + EXPECTED_LINK_TEMPLATE % ('http://example.com', 'example.com') + '&gt;</p>'
    ),

    # Test protocol specification.


@@ 54,6 54,14 @@ FIXTURE_POSITIVE_MATCHES = [
        'example.com',
        EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com')
    ),
    (
        'onion://example.com',
        EXPECTED_PARAGRAPH_TEMPLATE % ('onion://example.com', 'onion://example.com')
    ),
    (
        'onion9+.-://example.com',
        EXPECTED_PARAGRAPH_TEMPLATE % ('onion9+.-://example.com', 'onion9+.-://example.com')
    ),

    # Test various supported host variations.
    (


@@ 89,6 97,10 @@ FIXTURE_POSITIVE_MATCHES = [
        EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com')
    ),
    (
        'example.horse',
        EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.horse', 'example.horse')
    ),
    (
        'my.long.domain.example.com',
        EXPECTED_PARAGRAPH_TEMPLATE % ('http://my.long.domain.example.com', 'my.long.domain.example.com')
    ),


@@ 196,6 208,23 @@ FIXTURE_NEGATIVE_MATCHES = [
        '<p>1.2.3.4.5</p>',
    ),

    # Invalid protocols.
    (
        '9onion://example.com',
        '<p>9onion://example.com</p>',
    ),
    (
        '-onion://example.com',
        '<p>-onion://example.com</p>',
    ),
    (
        '+onion://example.com',
        '<p>+onion://example.com</p>',
    ),
    (
        '.onion://example.com',
        '<p>.onion://example.com</p>',
    ),
]




@@ 215,6 244,10 @@ class TestUrlizeExtension:
    def test_url_with_non_matching_begin_and_end_ignored(self):
        assert self.md.convert('(example.com>') == "<p>%s</p>" % html.escape('(example.com>')
        assert self.md.convert('<example.com)') == "<p>%s</p>" % html.escape('<example.com)')
        assert self.md.convert('(example.com') == "<p>%s</p>" % html.escape('(example.com')
        assert self.md.convert('example.com)') == "<p>%s</p>" % html.escape('example.com)')
        assert self.md.convert('<example.com') == "<p>%s</p>" % html.escape('<example.com')
        assert self.md.convert('example.com>') == "<p>%s</p>" % html.escape('example.com>')


def test_makeExtension_return_value():