@@ 2,98 2,163 @@ import re
import markdown
-"""
-Code modified from:
-https://github.com/r0wb0t/markdown-urlize
-
-A more liberal autolinker
-
-Inspired by Django's urlize function.
-
-Positive examples:
-
->>> import markdown
->>> md = markdown.Markdown(extensions=['urlize'])
-
->>> md.convert('http://example.com/')
-'<p><a href="http://example.com/">http://example.com/</a></p>'
-
->>> md.convert('go to http://example.com')
-'<p>go to <a href="http://example.com">http://example.com</a></p>'
-
->>> md.convert('example.com')
-'<p><a href="http://example.com">example.com</a></p>'
-
->>> md.convert('example.net')
-'<p><a href="http://example.net">example.net</a></p>'
+# Regular expression is meant to match the following pattern:
+#
+# [BEGIN][PROTOCOL]HOST[:PORT][/[PATH]][END]
+#
+# Everything except HOST is meant to be optional, as denoted by square
+# brackets.
+#
+# Pattern elements are as follows:
+#
+# BEGIN
+# String preceding the link. Can be empty, or any string that ends
+# in whitespace, '(', or '<'.
+#
+# PROTOCOL
+# Syntax defined in https://tools.ietf.org/html/rfc3986 - for
+# example: 'http://', 'https://', 'ftp://', or 'ftps://'.
+#
+# HOST
+# Host can be one of: IPv4 address, IPv6 address in full form, IPv6
+# address in shortened form (e.g. ::1 vs 0:....:0:1 or any
+# combination of), FQDN-like entry (dot-separated domain
+# components), or string 'localhost'.
+#
+# PORT
+# Port should be a numeric value. Keep in mind that it must be
+# preceded with the colon (':').
+#
+# PATH
+# Additional PATH, including any GET parameters that should be part
+# of the URL.
+#
+# END
+# String following the link. Can be empty, or any string that starts
+# with whitespace, ')', or '>'. If ')', then must match with '(' in
+# BEGIN. If '>', then must match with '<' in BEGIN.
+#
+# It should be noted that there are some inconsistencies with the below
+# regex, mainly that:
+#
+# - No IPv4 or IPv6 address validation is performed.
+# - Excessively long IPv6 addresses will end up being matched if the
+# shortened form happens somewhere in the middle of host string.
+#
+# In order to make the regex easier to handle later on, the following
+# named groups are provided:
+#
+# - begin (string coming before the link, including whitespace or
+# brackets).
+# - url (entire URL that can be used, for example, as actual link for
+# href).
+# - protocol (protocol, together with the trailing ://)
+# - host (just the host part)
+# - port (just the port number)
+# - path (path, combined with any additional GET parameters)
+# - end (string coming after the link, including whitespace or
+# brackets)
+#
+URLIZE_RE = (
+ # Links must start at beginning of string, or be preceded with
+ # whitespace, '(', or '<'.
+ r'^(?P<begin>|.*?[\s\(\<])'
->>> md.convert('www.example.us')
-'<p><a href="http://www.example.us">www.example.us</a></p>'
+ r'(?P<url>' # begin url group
->>> md.convert('(www.example.us/path/?name=val)')
-'<p>(<a href="http://www.example.us/path/?name=val">www.example.us/path/?name=val</a>)</p>'
+ # Leading protocol specification.
+ r'(?P<protocol>([A-Z][A-Z0-9+.-]*://|))'
->>> md.convert('go to <http://example.com> now!')
-'<p>go to <a href="http://example.com">http://example.com</a> now!</p>'
+ # Host identifier
+ r'(?P<host>' # begin host identifier group
-Negative examples:
+ r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|' # IPv4, match before FQDN
+ r'\[?([A-F0-9]{1,4}:){7}([A-F0-9]{1,4})\]?|' # IPv6, full form
+ r'\[?:(:[A-F0-9]{1,4}){1,6}\]?|' # IPv6, leading zeros removed
+ r'([A-F0-9]{1,4}:){1,6}:([A-F0-9]{1,4}){1,6}|' # IPv6, zeros in middle removed.
+ r'\[?([A-F0-9]{1,4}:){1,6}:\]?|' # IPv6, trailing zeros removed
+ r'\[?::\]?|' # IPv6, just "empty" address
+ r'([A-Z0-9]([A-Z0-9-]{0,61}[A-Z0-9])?\.)+([A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # FQDN
+ r'localhost' # localhost
+ r')' # end host identifier group
->>> md.convert('del.icio.us')
-'<p>del.icio.us</p>'
+ # Optional port
+ r'(:(?P<port>[0-9]+))?'
-"""
+ # Optional trailing slash with path and GET parameters.
+ r'(/(?P<path>[^\s\[\(\]\)\<\>]*))?'
+ r')' # end url group
-# Taken from Django trunk 2f121dfe635b3f497fe1fe03bc8eb97cdf5083b3
-# https://github.com/django/django/blob/master/django/core/validators.py#L47
-URLIZE_RE = (
- r'((?:(?:http|ftp)s?://|www\.)' # http:// or https://
- # domain...
- r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
- r'localhost|' # localhost...
- r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|' # ...or ipv4
- r'\[?[A-F0-9]*:[A-F0-9:]+\]?)' # ...or ipv6
- r'(?::[0-9]+)?' # optional port
- r'(?:/[^\s\[\(\]\)]*(?:\s+|$))?)'
+ # Links must stop at end of string, or be followed by a whitespace, ')', or '>'.
+ r'(?P<end>[\s\)\>].*?|)$'
)
class UrlizePattern(markdown.inlinepatterns.Pattern):
- def __init__(self, pattern, markdown_instance=None):
- markdown.inlinepatterns.Pattern.__init__(
- self,
- pattern,
- markdown_instance=markdown_instance)
- self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,
- re.DOTALL | re.UNICODE | re.IGNORECASE)
+ def getCompiledRegExp(self):
+ """
+ Return compiled regular expression for matching the URL
+ patterns. We introduce case-insensitive matching in addition
+ to standard matching flags added by parent class.
+ """
- """ Return a link Element given an autolink (`http://example/com`). """
+ # Ensure links are matched only if they stand on their own to avoid bad matches etc.
+ return re.compile(URLIZE_RE, re.DOTALL | re.UNICODE | re.IGNORECASE)
def handleMatch(self, m):
- url = m.group(2)
+ """
+ Processes match found within the text.
+ """
- if url.startswith('<'):
- url = url[1:-1]
+ protocol = m.group('protocol')
+ url = m.group('url')
text = url
- if not url.split('://')[0] in ('http', 'https', 'ftp'):
- if '@' in url and '/' not in url:
- url = 'mailto:' + url
- else:
- url = 'http://' + url
-
+ begin_url = m.group('begin')
+ end_url = m.group('end')
+
+ # If opening and ending character for URL are not the same,
+ # return text unchanged.
+ if begin_url:
+ begin_delimeter = begin_url[-1]
+ else:
+ begin_delimeter = ''
+ if end_url:
+ end_delimeter = end_url[0]
+ else:
+ end_delimeter = ''
+
+ if (
+ begin_delimeter == '<' and end_delimeter != '>' or
+ begin_delimeter == '(' and end_delimeter != ')' or
+ end_delimeter == ')' and begin_delimeter != '(' or
+ end_delimeter == '>' and begin_delimeter != '<'
+ ):
+ return url
+
+ # If no supported protocol is specified, assume plaintext http
+ # and add it to the url.
+ if protocol == '':
+ url = 'http://' + url
+
+ # Convenience icon to distinguish external links more easily.
icon = markdown.util.etree.Element("span")
icon.set('class', 'fa fa-external-link')
+ # Link text.
span_text = markdown.util.etree.Element("span")
span_text.text = markdown.util.AtomicString(" " + text)
+
+ # Set-up link itself.
el = markdown.util.etree.Element("a")
el.set('href', url)
el.set('target', '_blank')
el.append(icon)
el.append(span_text)
+
return el
@@ 110,8 175,3 @@ def makeExtension(configs=None):
if configs is None:
configs = {}
return UrlizeExtension(configs=configs)
-
-
-if __name__ == "__main__":
- import doctest
- doctest.testmod()
@@ 0,0 1,264 @@
+import html
+import markdown
+from unittest import mock
+
+import pytest
+
+from wiki.plugins.links.mdx.urlize import makeExtension, UrlizeExtension
+
+
+# Template accepts two strings - href value and link text value.
+EXPECTED_LINK_TEMPLATE = (
+ '<a href="%s" target="_blank">'
+ '<span class="fa fa-external-link">'
+ '</span>'
+ '<span>'
+ ' %s'
+ '</span>'
+ '</a>'
+)
+
+# Template accepts two strings - href value and link text value.
+EXPECTED_PARAGRAPH_TEMPLATE = '<p>%s</p>' % EXPECTED_LINK_TEMPLATE
+
+
+FIXTURE_POSITIVE_MATCHES = [
+ # Test surrounding begin/end characters.
+ (
+ '(example.com)',
+ '<p>(' + EXPECTED_LINK_TEMPLATE % ('http://example.com', 'example.com') + ')</p>'
+ ),
+ (
+ '<example.com>',
+ '<p><' + EXPECTED_LINK_TEMPLATE % ('http://example.com', 'example.com') + '></p>'
+ ),
+
+ # Test protocol specification.
+ (
+ 'http://example.com',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'http://example.com')
+ ),
+ (
+ 'https://example.com',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('https://example.com', 'https://example.com')
+ ),
+ (
+ 'ftp://example.com',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('ftp://example.com', 'ftp://example.com')
+ ),
+ (
+ 'ftps://example.com',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('ftps://example.com', 'ftps://example.com')
+ ),
+ (
+ 'example.com',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com')
+ ),
+ (
+ 'onion://example.com',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('onion://example.com', 'onion://example.com')
+ ),
+ (
+ 'onion9+.-://example.com',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('onion9+.-://example.com', 'onion9+.-://example.com')
+ ),
+
+ # Test various supported host variations.
+ (
+ '10.10.1.1',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://10.10.1.1', '10.10.1.1')
+ ),
+ (
+ '1122:3344:5566:7788:9900:aabb:ccdd:eeff',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://1122:3344:5566:7788:9900:aabb:ccdd:eeff', '1122:3344:5566:7788:9900:aabb:ccdd:eeff')
+ ),
+ (
+ '1122:3344:5566:7788:9900:AaBb:cCdD:EeFf',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://1122:3344:5566:7788:9900:AaBb:cCdD:EeFf', '1122:3344:5566:7788:9900:AaBb:cCdD:EeFf')
+ ),
+ (
+ '::1',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://::1', '::1')
+ ),
+ (
+ '1::2:3',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://1::2:3', '1::2:3')
+ ),
+ (
+ '1::',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://1::', '1::')
+ ),
+ (
+ '::',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://::', '::')
+ ),
+ (
+ 'example.com',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com')
+ ),
+ (
+ 'example.horse',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.horse', 'example.horse')
+ ),
+ (
+ 'my.long.domain.example.com',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://my.long.domain.example.com', 'my.long.domain.example.com')
+ ),
+ (
+ 'localhost',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://localhost', 'localhost')
+ ),
+
+ # Test port section.
+ (
+ 'localhost:8000',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://localhost:8000', 'localhost:8000')
+ ),
+ (
+ '10.1.1.1:8000',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://10.1.1.1:8000', '10.1.1.1:8000')
+ ),
+
+ # Test trailing path specification.
+ (
+ 'http://example.com/',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com/', 'http://example.com/')
+ ),
+ (
+ 'http://example.com/my/path',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com/my/path', 'http://example.com/my/path')
+ ),
+ (
+ 'http://example.com/my/path?param1=value1&param2=value2',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com/my/path?param1=value1&amp;param2=value2', 'http://example.com/my/path?param1=value1&amp;param2=value2')
+ ),
+
+ # Link positioned somewhere within the text, but around whitespace boundary.
+ (
+ 'This is link myhost.example.com',
+ "<p>This is link " + EXPECTED_LINK_TEMPLATE % ('http://myhost.example.com', 'myhost.example.com') + "</p>"
+ ),
+ (
+ 'myhost.example.com is the link',
+ "<p>" + EXPECTED_LINK_TEMPLATE % ('http://myhost.example.com', 'myhost.example.com') + " is the link</p>"
+ ),
+ (
+ 'I have best myhost.example.com link ever',
+ "<p>I have best " + EXPECTED_LINK_TEMPLATE % ('http://myhost.example.com', 'myhost.example.com') + " link ever</p>"
+ ),
+ (
+ 'I have best\nmyhost.example.com link ever',
+ "<p>I have best\n" + EXPECTED_LINK_TEMPLATE % ('http://myhost.example.com', 'myhost.example.com') + " link ever</p>"
+ ),
+]
+
+
+FIXTURE_NEGATIVE_MATCHES = [
+ # Incomplete FQDNs.
+ (
+ 'example.',
+ '<p>example.</p>'
+ ),
+ (
+ '.example .com',
+ '<p>.example .com</p>'
+ ),
+
+ # localhost as part of another word.
+ (
+ 'localhosts',
+ '<p>localhosts</p>'
+ ),
+
+ # Invalid FQDNs.
+ (
+ 'example-.com',
+ '<p>example-.com</p>'
+ ),
+ (
+ '-example.com',
+ '<p>-example.com</p>'
+ ),
+ (
+ 'my.-example.com',
+ '<p>my.-example.com</p>'
+ ),
+
+ # Invalid IPv6 patterns.
+ (
+ '1:2:3:4:5:6:7:8:a', # Use :a, because using a number would match as optional port
+ '<p>1:2:3:4:5:6:7:8:a</p>',
+ ),
+ (
+ '1::2::3',
+ '<p>1::2::3</p>',
+ ),
+ (
+ '::::1',
+ '<p>::::1</p>',
+ ),
+ (
+ '1::::',
+ '<p>1::::</p>',
+ ),
+
+ # Invalid IPv4 patterns.
+ (
+ '1.2.3.4.5',
+ '<p>1.2.3.4.5</p>',
+ ),
+
+ # Invalid protocols.
+ (
+ '9onion://example.com',
+ '<p>9onion://example.com</p>',
+ ),
+ (
+ '-onion://example.com',
+ '<p>-onion://example.com</p>',
+ ),
+ (
+ '+onion://example.com',
+ '<p>+onion://example.com</p>',
+ ),
+ (
+ '.onion://example.com',
+ '<p>.onion://example.com</p>',
+ ),
+]
+
+
+class TestUrlizeExtension:
+
+ def setup_method(self):
+ self.md = markdown.Markdown(extensions=[UrlizeExtension()])
+
+ @pytest.mark.parametrize("markdown_text, expected_output", FIXTURE_POSITIVE_MATCHES)
+ def test_positive_matches(self, markdown_text, expected_output):
+ assert self.md.convert(markdown_text) == expected_output
+
+ @pytest.mark.parametrize("markdown_text, expected_output", FIXTURE_NEGATIVE_MATCHES)
+ def test_negative_matches(self, markdown_text, expected_output):
+ assert self.md.convert(markdown_text) == expected_output
+
+ def test_url_with_non_matching_begin_and_end_ignored(self):
+ assert self.md.convert('(example.com>') == "<p>%s</p>" % html.escape('(example.com>')
+ assert self.md.convert('<example.com)') == "<p>%s</p>" % html.escape('<example.com)')
+ assert self.md.convert('(example.com') == "<p>%s</p>" % html.escape('(example.com')
+ assert self.md.convert('example.com)') == "<p>%s</p>" % html.escape('example.com)')
+ assert self.md.convert('<example.com') == "<p>%s</p>" % html.escape('<example.com')
+ assert self.md.convert('example.com>') == "<p>%s</p>" % html.escape('example.com>')
+
+
+def test_makeExtension_return_value():
+ extension = makeExtension()
+
+ assert isinstance(extension, UrlizeExtension)
+
+
+@mock.patch('wiki.plugins.links.mdx.urlize.UrlizeExtension')
+def test_makeExtension_initialises_using_passed_in_configuration(mock_UrlizeExtension):
+ my_config = mock.Mock()
+ makeExtension(my_config)
+
+ mock_UrlizeExtension.assert_called_once_with(configs=my_config)