From 919975792d814d5144b9174ae70743dc27490169 Mon Sep 17 00:00:00 2001 From: Branko Majic Date: Wed, 21 Feb 2018 16:34:05 +0100 Subject: [PATCH] Fix internal urlize markdown extension: - Added comprehensive tests for the extension. - Rewrote the extension to have better behaviour. - Fixed issues where URL path/GET parameters would end-up not being part of the link. - Make the extension more flexible in terms of matching FQDNs that do not start with www. - Added inline documentation to make it clearer what is expected from the extension. --- src/wiki/plugins/links/mdx/urlize.py | 170 +++++++++++++++----------- tests/plugins/links/test_urlize.py | 174 +++++++++++++++++++++++++++ 2 files changed, 276 insertions(+), 68 deletions(-) create mode 100644 tests/plugins/links/test_urlize.py diff --git a/src/wiki/plugins/links/mdx/urlize.py b/src/wiki/plugins/links/mdx/urlize.py index d02fc4d7..3dc630c6 100644 --- a/src/wiki/plugins/links/mdx/urlize.py +++ b/src/wiki/plugins/links/mdx/urlize.py @@ -2,98 +2,137 @@ import re import markdown -""" -Code modified from: -https://github.com/r0wb0t/markdown-urlize - -A more liberal autolinker - -Inspired by Django's urlize function. - -Positive examples: - ->>> import markdown ->>> md = markdown.Markdown(extensions=['urlize']) - ->>> md.convert('http://example.com/') -'

http://example.com/

' - ->>> md.convert('go to http://example.com') -'

go to http://example.com

' - ->>> md.convert('example.com') -'

example.com

' - ->>> md.convert('example.net') -'

example.net

' - ->>> md.convert('www.example.us') -'

www.example.us

' - ->>> md.convert('(www.example.us/path/?name=val)') -'

(www.example.us/path/?name=val)

' +# Regular expression is meant to match the following pattern: +# +# [BEGIN_URL][PROTOCOL]HOST[:PORT][/[PATH]][END_URL] +# +# Everything except HOST is meant to be optional, as denoted by square +# brackets. +# +# Pattern elements are as follows: +# +# BEGIN_URL +# Either '<' or '('. +# +# PROTOCOL +# One of: 'http://', 'https://', 'ftp://', or 'ftps://'. +# +# HOST +# Host can be one of: IPv4 address, IPv6 address in full form, IPv6 +# address in shortened form (e.g. ::1 vs 0:....:0:1 or any +# combination of), FQDN-like entry (dot-separated domain +# components), or string 'localhost'. +# +# PORT +# Port should be a numeric value. Keep in mind that it must be +# preceded with the colon (':'). +# +# PATH +# Additional PATH, including any GET parameters that should be part +# of the URL. +# +# END_URL +# Either '>' or ')'. Should match with same type as BEGIN_URL. +# +# It should be noted that there are some inconsistencies with the below +# regex, mainly that: +# +# - No IPv4 or IPv6 address validation is performed. +# - Excessively long IPv6 addresses will end-up being matched if the +# shortened form happens somewhere in the middle of host string. +# +# In order to make the regex easier to handle later on, the following +# named groups are provided: +# +# - begin (character denoting beginning of URL) +# - url (entire URL that can be used, for example, as actual link for +# href). +# - protocol (protocol, together with the trailing ://) +# - host (just the host part) +# - port (just the port number) +# - path (path, combined with any additional GET parameters) +# - end (character denoting end of URL) +# +URLIZE_RE = ( + r'(?P<begin>[\(\<])?(?P<url>' # begin url group ->>> md.convert('go to now!') -'

go to http://example.com now!

' + # Leading protocol specification. + r'(?P<protocol>http://|https://|ftp://|ftps://|)' -Negative examples: + # Host identifier + r'(?P<host>' # begin host identifier group ->>> md.convert('del.icio.us') -'

del.icio.us

' + r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|' # IPv4, match before FQDN + r'\[?[A-F0-9]{1,4}:([A-F0-9]{1,4}:){6}[A-F0-9]{1,4}\]?|' # IPv6, full form + r'\[?:(:[A-F0-9]{1,4}){1,6}\]?|' # IPv6, leading zeros removed + r'([A-F0-9]{1,4}:){1,6}:([A-F0-9]{1,4}){1,6}|' # IPv6, zeros in middle removed. + r'\[?([A-F0-9]{1,4}:){1,6}:\]?|' # IPv6, trailing zeros removed + r'\[?::\]?|' # IPv6, just "empty" address + r'([A-Z0-9]([A-Z0-9-]{0,61}[A-Z0-9])?\.)+([A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # FQDN + r'localhost' # localhost + r')' # end host identifier group -""" + # Optional port + r'(:(?P<port>[0-9]+))?' + # Optional trailing slash with path and GET parameters. + r'(/(?P<path>[^\s\[\(\]\)\<\>]*))?' -# Taken from Django trunk 2f121dfe635b3f497fe1fe03bc8eb97cdf5083b3 -# https://github.com/django/django/blob/master/django/core/validators.py#L47 -URLIZE_RE = ( - r'((?:(?:http|ftp)s?://|www\.)' # http:// or https:// - # domain... - r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' - r'localhost|' # localhost... - r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|' # ...or ipv4 - r'\[?[A-F0-9]*:[A-F0-9:]+\]?)' # ...or ipv6 - r'(?::[0-9]+)?' # optional port - r'(?:/[^\s\[\(\]\)]*(?:\s+|$))?)' + r')(?P<end>[\)\>])?' # end url group ) class UrlizePattern(markdown.inlinepatterns.Pattern): - def __init__(self, pattern, markdown_instance=None): - markdown.inlinepatterns.Pattern.__init__( - self, - pattern, - markdown_instance=markdown_instance) - self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, - re.DOTALL | re.UNICODE | re.IGNORECASE) + def getCompiledRegExp(self): + """ + Return compiled regular expression for matching the URL + patterns. We introduce case-insensitive matching in addition + to standard matching flags added by parent class. + """ - """ Return a link Element given an autolink (`http://example/com`). 
""" + return re.compile(r'^(.*?)%s(.*?)$' % URLIZE_RE, re.DOTALL | re.UNICODE | re.IGNORECASE) def handleMatch(self, m): - url = m.group(2) + """ + Processes match found within the text. + """ + + matched_string = m.group(0) - if url.startswith('<'): - url = url[1:-1] + protocol = m.group('protocol') + url = m.group('url') text = url - if not url.split('://')[0] in ('http', 'https', 'ftp'): - if '@' in url and '/' not in url: - url = 'mailto:' + url - else: - url = 'http://' + url + begin_url = m.group('begin') + end_url = m.group('end') + + # If opening and ending character for URL are not the same, + # return text unchanged. + if begin_url == '<' and end_url != '>' or begin_url == '(' and end_url != ')': + return matched_string + + # If no supported protocol is specified, assume plaintext http + # and add it to the url. + if protocol not in ('http://', 'https://', 'ftp://', 'ftps://'): + url = 'http://' + url + # Convenience link to distinguish external links more easily. icon = markdown.util.etree.Element("span") icon.set('class', 'fa fa-external-link') + # Link text. span_text = markdown.util.etree.Element("span") span_text.text = markdown.util.AtomicString(" " + text) + + # Set-up link itself. el = markdown.util.etree.Element("a") el.set('href', url) el.set('target', '_blank') el.append(icon) el.append(span_text) + return el @@ -110,8 +149,3 @@ def makeExtension(configs=None): if configs is None: configs = {} return UrlizeExtension(configs=configs) - - -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/tests/plugins/links/test_urlize.py b/tests/plugins/links/test_urlize.py new file mode 100644 index 00000000..079c123a --- /dev/null +++ b/tests/plugins/links/test_urlize.py @@ -0,0 +1,174 @@ +import html +import markdown +from unittest import mock + +import pytest + +from wiki.plugins.links.mdx.urlize import makeExtension, UrlizeExtension + + +# Template accepts two strings - href value and link text value. 
+EXPECTED_LINK_TEMPLATE = ( + '' + '' + '' + '' + ' %s' + '' + '' +) + +# Template accepts two strings - href value and link text value. +EXPECTED_PARAGRAPH_TEMPLATE = '

%s

' % EXPECTED_LINK_TEMPLATE + + +FIXTURE_POSITIVE_MATCHES = [ + # Test surrounding begin/end characters. + ( + '(example.com)', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com') + ), + ( + '', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com') + ), + + # Test protocol specification. + ( + 'http://example.com', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'http://example.com') + ), + ( + 'https://example.com', + EXPECTED_PARAGRAPH_TEMPLATE % ('https://example.com', 'https://example.com') + ), + ( + 'ftp://example.com', + EXPECTED_PARAGRAPH_TEMPLATE % ('ftp://example.com', 'ftp://example.com') + ), + ( + 'ftps://example.com', + EXPECTED_PARAGRAPH_TEMPLATE % ('ftps://example.com', 'ftps://example.com') + ), + ( + 'example.com', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com') + ), + + # Test various supported host variations. + ( + '10.10.1.1', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://10.10.1.1', '10.10.1.1') + ), + ( + '1122:3344:5566:7788:9900:aabb:ccdd:eeff', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://1122:3344:5566:7788:9900:aabb:ccdd:eeff', '1122:3344:5566:7788:9900:aabb:ccdd:eeff') + ), + ( + '1122:3344:5566:7788:9900:AaBb:cCdD:EeFf', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://1122:3344:5566:7788:9900:AaBb:cCdD:EeFf', '1122:3344:5566:7788:9900:AaBb:cCdD:EeFf') + ), + ( + '::1', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://::1', '::1') + ), + ( + '1::2:3', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://1::2:3', '1::2:3') + ), + ( + '1::', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://1::', '1::') + ), + ( + '::', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://::', '::') + ), + ( + 'example.com', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com') + ), + ( + 'my.long.domain.example.com', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://my.long.domain.example.com', 'my.long.domain.example.com') + ), + ( + 'localhost', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://localhost', 'localhost') + ), + + # Test port section. 
+ ( + 'localhost:8000', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://localhost:8000', 'localhost:8000') + ), + ( + '10.1.1.1:8000', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://10.1.1.1:8000', '10.1.1.1:8000') + ), + + # Test trailing path specification. + ( + 'http://example.com/', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com/', 'http://example.com/') + ), + ( + 'http://example.com/my/path', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com/my/path', 'http://example.com/my/path') + ), + ( + 'http://example.com/my/path?param1=value1&param2=value2', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com/my/path?param1=value1&param2=value2', 'http://example.com/my/path?param1=value1&param2=value2') + ), +] + + +FIXTURE_NEGATIVE_MATCHES = [ + # Incomplete FQDNs. + ( + 'example.', + '

example.

' + ), + ( + '.example .com', + '

.example .com

' + ), + + # Invalid FQDNs. + ( + 'example-.com', + '

example-.com

' + ), +] + + +class TestUrlizeExtension: + + def setup_method(self): + self.md = markdown.Markdown(extensions=[UrlizeExtension()]) + + @pytest.mark.parametrize("markdown_text, expected_output", FIXTURE_POSITIVE_MATCHES) + def test_positive_matches(self, markdown_text, expected_output): + assert self.md.convert(markdown_text) == expected_output + + @pytest.mark.parametrize("markdown_text, expected_output", FIXTURE_NEGATIVE_MATCHES) + def test_negative_matches(self, markdown_text, expected_output): + assert self.md.convert(markdown_text) == expected_output + + def test_url_with_non_matching_begin_and_end_ignored(self): + assert self.md.convert('(example.com>') == "

%s

" % html.escape('(example.com>') + assert self.md.convert('