~netlandish/django-wiki

2a1745c140e4231240abe68a5c7250920aee505c — Benjamin Bach 6 years ago 9da59ba + 8cff689
Merge pull request #816 from azaghal/fix-urlize-markdown-extension

Fix urlize markdown extension
2 files changed, 391 insertions(+), 67 deletions(-)

M src/wiki/plugins/links/mdx/urlize.py
A tests/plugins/links/test_urlize.py
M src/wiki/plugins/links/mdx/urlize.py => src/wiki/plugins/links/mdx/urlize.py +127 -67
@@ 2,98 2,163 @@ import re

import markdown

"""
Code modified from:
https://github.com/r0wb0t/markdown-urlize

A more liberal autolinker

Inspired by Django's urlize function.

Positive examples:

>>> import markdown
>>> md = markdown.Markdown(extensions=['urlize'])

>>> md.convert('http://example.com/')
'<p><a href="http://example.com/">http://example.com/</a></p>'

>>> md.convert('go to http://example.com')
'<p>go to <a href="http://example.com">http://example.com</a></p>'

>>> md.convert('example.com')
'<p><a href="http://example.com">example.com</a></p>'

>>> md.convert('example.net')
'<p><a href="http://example.net">example.net</a></p>'
# Regular expression is meant to match the following pattern:
#
# [BEGIN][PROTOCOL]HOST[:PORT][/[PATH]][END]
#
# Everything except HOST is meant to be optional, as denoted by square
# brackets.
#
# Pattern elements are as follows:
#
# BEGIN
#   String preceding the link. Can be empty, or any string that ends
#   in whitespace, '(', or '<'.
#
# PROTOCOL
#   Syntax defined in https://tools.ietf.org/html/rfc3986 - for
#   example: 'http://', 'https://', 'ftp://', or 'ftps://'.
#
# HOST
#   Host can be one of: IPv4 address, IPv6 address in full form, IPv6
#   address in shortened form (e.g. ::1 vs 0:....:0:1 or any
#   combination of), FQDN-like entry (dot-separated domain
#   components), or string 'localhost'.
#
# PORT
#   Port should be a numeric value. Keep in mind that it must be
#   preceded with the colon (':').
#
# PATH
#   Additional PATH, including any GET parameters that should be part
#   of the URL.
#
# END
#   String following the link. Can be empty, or any string that ends
#   in whitespace, ')', or '>'. If ')', then must match with '(' in
#   BEGIN. If '>', then must match with '<' in BEGIN.
#
# It should be noted that there are some inconsistencies with the below
# regex, mainly that:
#
# - No IPv4 or IPv6 address validation is performed.
# - Excessively long IPv6 addresses will end up being matched if the
#   shortened form happens somewhere in the middle of host string.
#
# In order to make the regex easier to handle later on, the following
# named groups are provided:
#
# - begin (string coming before the link, including whitespace or
#   brackets).
# - url (entire URL that can be used, for example, as actual link for
#   href).
# - protocol (protocol, together with the trailing ://)
# - host (just the host part)
# - port (just the port number)
# - path (path, combined with any additional GET parameters)
# - end (string coming after the link, including whitespace or
#   brackets)
#
URLIZE_RE = (
    # Links must start at beginning of string, or be preceded with
    # whitespace, '(', or '<'.
    r'^(?P<begin>|.*?[\s\(\<])'

>>> md.convert('www.example.us')
'<p><a href="http://www.example.us">www.example.us</a></p>'
    r'(?P<url>'  # begin url group

>>> md.convert('(www.example.us/path/?name=val)')
'<p>(<a href="http://www.example.us/path/?name=val">www.example.us/path/?name=val</a>)</p>'
    # Leading protocol specification.
    r'(?P<protocol>([A-Z][A-Z0-9+.-]*://|))'

>>> md.convert('go to <http://example.com> now!')
'<p>go to <a href="http://example.com">http://example.com</a> now!</p>'
    # Host identifier
    r'(?P<host>'  # begin host identifier group

Negative examples:
    r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|'  # IPv4, match before FQDN
    r'\[?([A-F0-9]{1,4}:){7}([A-F0-9]{1,4})\]?|'  # IPv6, full form
    r'\[?:(:[A-F0-9]{1,4}){1,6}\]?|'  # IPv6, leading zeros removed
    r'([A-F0-9]{1,4}:){1,6}:([A-F0-9]{1,4}){1,6}|'  # IPv6, zeros in middle removed.
    r'\[?([A-F0-9]{1,4}:){1,6}:\]?|'  # IPv6, trailing zeros removed
    r'\[?::\]?|'  # IPv6, just "empty" address
    r'([A-Z0-9]([A-Z0-9-]{0,61}[A-Z0-9])?\.)+([A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # FQDN
    r'localhost'  # localhost
    r')'  # end host identifier group

>>> md.convert('del.icio.us')
'<p>del.icio.us</p>'
    # Optional port
    r'(:(?P<port>[0-9]+))?'

"""
    # Optional trailing slash with path and GET parameters.
    r'(/(?P<path>[^\s\[\(\]\)\<\>]*))?'

    r')'  # end url group

# Taken from Django trunk 2f121dfe635b3f497fe1fe03bc8eb97cdf5083b3
# https://github.com/django/django/blob/master/django/core/validators.py#L47
URLIZE_RE = (
    r'((?:(?:http|ftp)s?://|www\.)'  # http:// or https://
    # domain...
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
    r'localhost|'  # localhost...
    r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|'  # ...or ipv4
    r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
    r'(?::[0-9]+)?'  # optional port
    r'(?:/[^\s\[\(\]\)]*(?:\s+|$))?)'
    # Links must stop at end of string, or be followed by a whitespace, ')', or '>'.
    r'(?P<end>[\s\)\>].*?|)$'
)


class UrlizePattern(markdown.inlinepatterns.Pattern):

    def __init__(self, pattern, markdown_instance=None):
        markdown.inlinepatterns.Pattern.__init__(
            self,
            pattern,
            markdown_instance=markdown_instance)
        self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,
                                      re.DOTALL | re.UNICODE | re.IGNORECASE)
    def getCompiledRegExp(self):
        """
        Return compiled regular expression for matching the URL
        patterns. We introduce case-insensitive matching in addition
        to standard matching flags added by parent class.
        """

    """ Return a link Element given an autolink (`http://example/com`). """
        # Ensure links are matched only if they stand on their own to avoid bad matches etc.
        return re.compile(URLIZE_RE, re.DOTALL | re.UNICODE | re.IGNORECASE)

    def handleMatch(self, m):
        url = m.group(2)
        """
        Processes match found within the text.
        """

        if url.startswith('<'):
            url = url[1:-1]
        protocol = m.group('protocol')

        url = m.group('url')
        text = url

        if not url.split('://')[0] in ('http', 'https', 'ftp'):
            if '@' in url and '/' not in url:
                url = 'mailto:' + url
            else:
                url = 'http://' + url

        begin_url = m.group('begin')
        end_url = m.group('end')

        # If the opening and closing delimiters around the URL do not
        # match each other, return the text unchanged.
        if begin_url:
            begin_delimeter = begin_url[-1]
        else:
            begin_delimeter = ''
        if end_url:
            end_delimeter = end_url[0]
        else:
            end_delimeter = ''

        if (
                begin_delimeter == '<' and end_delimeter != '>' or
                begin_delimeter == '(' and end_delimeter != ')' or
                end_delimeter == ')' and begin_delimeter != '(' or
                end_delimeter == '>' and begin_delimeter != '<'
        ):
            return url

        # If no supported protocol is specified, assume plaintext http
        # and add it to the url.
        if protocol == '':
            url = 'http://' + url

        # Convenience link to distinguish external links more easily.
        icon = markdown.util.etree.Element("span")
        icon.set('class', 'fa fa-external-link')

        # Link text.
        span_text = markdown.util.etree.Element("span")
        span_text.text = markdown.util.AtomicString(" " + text)

        # Set-up link itself.
        el = markdown.util.etree.Element("a")
        el.set('href', url)
        el.set('target', '_blank')
        el.append(icon)
        el.append(span_text)

        return el




@@ 110,8 175,3 @@ def makeExtension(configs=None):
    if configs is None:
        configs = {}
    return UrlizeExtension(configs=configs)


if __name__ == "__main__":
    import doctest
    doctest.testmod()

A tests/plugins/links/test_urlize.py => tests/plugins/links/test_urlize.py +264 -0
@@ 0,0 1,264 @@
import html
import markdown
from unittest import mock

import pytest

from wiki.plugins.links.mdx.urlize import makeExtension, UrlizeExtension


# Markup expected for a single rendered link. Takes two %-format
# arguments, in order: the href value and the visible link text.
EXPECTED_LINK_TEMPLATE = ''.join([
    '<a href="%s" target="_blank">',
    '<span class="fa fa-external-link">',
    '</span>',
    '<span>',
    ' %s',
    '</span>',
    '</a>',
])

# The same link as the only content of a paragraph. Takes the same two
# %-format arguments (href value, then link text value).
EXPECTED_PARAGRAPH_TEMPLATE = '<p>' + EXPECTED_LINK_TEMPLATE + '</p>'


def _expected_html(href, text, before='', after=''):
    """
    Build the expected paragraph HTML for one link.

    The link is rendered from EXPECTED_LINK_TEMPLATE using ``href`` and
    ``text``; ``before`` and ``after`` are the (already HTML-escaped)
    paragraph contents surrounding the link.
    """
    link_html = EXPECTED_LINK_TEMPLATE % (href, text)
    return '<p>' + before + link_html + after + '</p>'


# Pairs of (markdown input, expected HTML output) for inputs that are
# expected to be turned into links.
FIXTURE_POSITIVE_MATCHES = [
    # Test surrounding begin/end characters.
    (
        '(example.com)',
        _expected_html('http://example.com', 'example.com', before='(', after=')'),
    ),
    (
        '<example.com>',
        _expected_html('http://example.com', 'example.com', before='&lt;', after='&gt;'),
    ),

    # Test protocol specification.
    (
        'http://example.com',
        _expected_html('http://example.com', 'http://example.com'),
    ),
    (
        'https://example.com',
        _expected_html('https://example.com', 'https://example.com'),
    ),
    (
        'ftp://example.com',
        _expected_html('ftp://example.com', 'ftp://example.com'),
    ),
    (
        'ftps://example.com',
        _expected_html('ftps://example.com', 'ftps://example.com'),
    ),
    (
        'example.com',
        _expected_html('http://example.com', 'example.com'),
    ),
    (
        'onion://example.com',
        _expected_html('onion://example.com', 'onion://example.com'),
    ),
    (
        'onion9+.-://example.com',
        _expected_html('onion9+.-://example.com', 'onion9+.-://example.com'),
    ),

    # Test various supported host variations.
    (
        '10.10.1.1',
        _expected_html('http://10.10.1.1', '10.10.1.1'),
    ),
    (
        '1122:3344:5566:7788:9900:aabb:ccdd:eeff',
        _expected_html(
            'http://1122:3344:5566:7788:9900:aabb:ccdd:eeff',
            '1122:3344:5566:7788:9900:aabb:ccdd:eeff',
        ),
    ),
    (
        '1122:3344:5566:7788:9900:AaBb:cCdD:EeFf',
        _expected_html(
            'http://1122:3344:5566:7788:9900:AaBb:cCdD:EeFf',
            '1122:3344:5566:7788:9900:AaBb:cCdD:EeFf',
        ),
    ),
    (
        '::1',
        _expected_html('http://::1', '::1'),
    ),
    (
        '1::2:3',
        _expected_html('http://1::2:3', '1::2:3'),
    ),
    (
        '1::',
        _expected_html('http://1::', '1::'),
    ),
    (
        '::',
        _expected_html('http://::', '::'),
    ),
    (
        'example.com',
        _expected_html('http://example.com', 'example.com'),
    ),
    (
        'example.horse',
        _expected_html('http://example.horse', 'example.horse'),
    ),
    (
        'my.long.domain.example.com',
        _expected_html('http://my.long.domain.example.com', 'my.long.domain.example.com'),
    ),
    (
        'localhost',
        _expected_html('http://localhost', 'localhost'),
    ),

    # Test port section.
    (
        'localhost:8000',
        _expected_html('http://localhost:8000', 'localhost:8000'),
    ),
    (
        '10.1.1.1:8000',
        _expected_html('http://10.1.1.1:8000', '10.1.1.1:8000'),
    ),

    # Test trailing path specification.
    (
        'http://example.com/',
        _expected_html('http://example.com/', 'http://example.com/'),
    ),
    (
        'http://example.com/my/path',
        _expected_html('http://example.com/my/path', 'http://example.com/my/path'),
    ),
    (
        # The ampersand is expected HTML-escaped in both href and link text.
        'http://example.com/my/path?param1=value1&param2=value2',
        _expected_html(
            'http://example.com/my/path?param1=value1&amp;param2=value2',
            'http://example.com/my/path?param1=value1&amp;param2=value2',
        ),
    ),

    # Link positioned somewhere within the text, but around whitespace boundary.
    (
        'This is link myhost.example.com',
        _expected_html(
            'http://myhost.example.com', 'myhost.example.com',
            before='This is link ',
        ),
    ),
    (
        'myhost.example.com is the link',
        _expected_html(
            'http://myhost.example.com', 'myhost.example.com',
            after=' is the link',
        ),
    ),
    (
        'I have best myhost.example.com link ever',
        _expected_html(
            'http://myhost.example.com', 'myhost.example.com',
            before='I have best ', after=' link ever',
        ),
    ),
    (
        'I have best\nmyhost.example.com link ever',
        _expected_html(
            'http://myhost.example.com', 'myhost.example.com',
            before='I have best\n', after=' link ever',
        ),
    ),
]


# Markdown inputs that must NOT be turned into links.
_NEGATIVE_INPUTS = [
    # Incomplete FQDNs.
    'example.',
    '.example .com',

    # localhost as part of another word.
    'localhosts',

    # Invalid FQDNs.
    'example-.com',
    '-example.com',
    'my.-example.com',

    # Invalid IPv6 patterns.
    '1:2:3:4:5:6:7:8:a',  # Use :a, because using a number would match as optional port
    '1::2::3',
    '::::1',
    '1::::',

    # Invalid IPv4 patterns.
    '1.2.3.4.5',

    # Invalid protocols.
    '9onion://example.com',
    '-onion://example.com',
    '+onion://example.com',
    '.onion://example.com',
]

# Pairs of (markdown input, expected HTML output). For every negative
# case the expected output is simply the input wrapped in a paragraph.
FIXTURE_NEGATIVE_MATCHES = [
    (raw_text, '<p>' + raw_text + '</p>')
    for raw_text in _NEGATIVE_INPUTS
]


class TestUrlizeExtension:
    """
    Checks the HTML produced by a Markdown instance that has
    UrlizeExtension enabled.
    """

    def setup_method(self):
        # Build the Markdown converter used by the test methods.
        urlize = UrlizeExtension()
        self.md = markdown.Markdown(extensions=[urlize])

    @pytest.mark.parametrize("markdown_text, expected_output", FIXTURE_POSITIVE_MATCHES)
    def test_positive_matches(self, markdown_text, expected_output):
        """Each positive fixture must render to its expected HTML."""
        rendered = self.md.convert(markdown_text)
        assert rendered == expected_output

    @pytest.mark.parametrize("markdown_text, expected_output", FIXTURE_NEGATIVE_MATCHES)
    def test_negative_matches(self, markdown_text, expected_output):
        """Each negative fixture must render to its expected HTML."""
        rendered = self.md.convert(markdown_text)
        assert rendered == expected_output

    def test_url_with_non_matching_begin_and_end_ignored(self):
        """
        A URL whose surrounding brackets are mismatched or unbalanced
        is expected to come out as plain, HTML-escaped paragraph text.
        """
        unbalanced_inputs = (
            '(example.com>',
            '<example.com)',
            '(example.com',
            'example.com)',
            '<example.com',
            'example.com>',
        )

        for raw_text in unbalanced_inputs:
            assert self.md.convert(raw_text) == "<p>%s</p>" % html.escape(raw_text)


def test_makeExtension_return_value():
    """makeExtension() called without arguments returns a UrlizeExtension."""
    assert isinstance(makeExtension(), UrlizeExtension)


@mock.patch('wiki.plugins.links.mdx.urlize.UrlizeExtension')
def test_makeExtension_initialises_using_passed_in_configuration(mock_UrlizeExtension):
    """
    The object given to makeExtension() must be forwarded to
    UrlizeExtension as the ``configs`` keyword argument, exactly once.
    """
    config_stub = mock.Mock()

    makeExtension(config_stub)

    mock_UrlizeExtension.assert_called_once_with(configs=config_stub)