From 8cff689b42e4412243c0ec819d15b287c3417c60 Mon Sep 17 00:00:00 2001
From: Branko Majic
Date: Fri, 23 Feb 2018 12:01:53 +0100
Subject: [PATCH] Extend custom urlize markdown extension to support any
protocol specifier and fix handling of opening/closing brackets:
- Updated handling of opening/closing brackets ('()' and '<>') to be
more robust, covering some corner cases, and to leave the brackets
intact in resulting generated HTML (they should still surround the
URL element, and not be removed).
- Added support for any valid protocol specifier, according to RFC
3986.
- Moved entire regex definition into the URLIZE_RE regex variable.
- Updated existing test cases to match new implementation.
- Added new test cases to cover new matching functionality.
---
src/wiki/plugins/links/mdx/urlize.py | 59 ++++++++++++++++++++--------
tests/plugins/links/test_urlize.py | 37 ++++++++++++++++-
2 files changed, 77 insertions(+), 19 deletions(-)
diff --git a/src/wiki/plugins/links/mdx/urlize.py b/src/wiki/plugins/links/mdx/urlize.py
index fe487628..4498d3e5 100644
--- a/src/wiki/plugins/links/mdx/urlize.py
+++ b/src/wiki/plugins/links/mdx/urlize.py
@@ -4,18 +4,20 @@ import markdown
# Regular expression is meant to match the following pattern:
#
-# [BEGIN_URL][PROTOCOL]HOST[:PORT][/[PATH]][END_URL]
+# [BEGIN][PROTOCOL]HOST[:PORT][/[PATH]][END]
#
# Everything except HOST is meant to be optional, as denoted by square
# brackets.
#
# Patter elements are as follows:
#
-# BEGIN_URL
-# Either '<' or '('.
+# BEGIN
+# String preceding the link. Can be empty, or any string that ends
+# in whitespace, '(', or '<'.
#
# PROTOCOL
-# One of: 'http://', 'https://', 'ftp://', or 'ftps://'.
+# Syntax defined in https://tools.ietf.org/html/rfc3986 - for
+# example: 'http://', 'https://', 'ftp://', or 'ftps://'.
#
# HOST
# Host can be one of: IPv4 address, IPv6 address in full form, IPv6
@@ -31,8 +33,10 @@ import markdown
# Additional PATH, including any GET parameters that should be part
# of the URL.
#
-# END_URL
-# Either '>' or ')'. Should match with same type as BEGIN_URL.
+# END
+# String following the link. Can be empty, or any string that ends
+# in whitespace, ')', or '>'. If ')', then must match with '(' in
+# BEGIN. If '>', then must match with '<' in BEGIN.
#
# It should be noted that there are some inconsitencies with the below
# regex, mainly that:
@@ -44,20 +48,26 @@ import markdown
# In order to make the regex easier to handle later on, the following
# named groups are provided:
#
-# - begin (character denoting beginning of URL)
+# - begin (string coming before the link, including whitespace or
+# brackets).
# - url (entire URL that can be used, for example, as actual link for
# href).
# - protocol (protocol, together with the trailing ://)
# - host (just the host part)
# - port (just the port number)
# - path (path, combined with any additional GET parameters)
-# - end (character denoting end of URL)
+# - end (string coming after the link, including whitespace or
+# brackets)
#
URLIZE_RE = (
- r'(?P[\(\<])?(?P' # begin url group
+ # Links must start at beginning of string, or be preceded with
+ # whitespace, '(', or '<'.
+ r'^(?P|.*?[\s\(\<])'
+
+ r'(?P' # begin url group
# Leading protocol specification.
- r'(?Phttp://|https://|ftp://|ftps://|)'
+ r'(?P([A-Z][A-Z0-9+.-]*://|))'
# Host identifier
r'(?P' # begin host identifier group
@@ -78,7 +88,10 @@ URLIZE_RE = (
# Optional trailing slash with path and GET parameters.
r'(/(?P[^\s\[\(\]\)\<\>]*))?'
- r')(?P[\)\>])?' # end url group
+ r')' # end url group
+
+ # Links must stop at end of string, or be followed by a whitespace, ')', or '>'.
+ r'(?P[\s\)\>].*?|)$'
)
@@ -92,15 +105,13 @@ class UrlizePattern(markdown.inlinepatterns.Pattern):
"""
# Ensure links are matched only if they stand on their own to avoid bad matches etc.
- return re.compile(r'^(|.*?\s)%s(\s.*?|)$' % URLIZE_RE, re.DOTALL | re.UNICODE | re.IGNORECASE)
+ return re.compile(URLIZE_RE, re.DOTALL | re.UNICODE | re.IGNORECASE)
def handleMatch(self, m):
"""
Processes match found within the text.
"""
- matched_string = m.group(0)
-
protocol = m.group('protocol')
url = m.group('url')
@@ -111,12 +122,26 @@ class UrlizePattern(markdown.inlinepatterns.Pattern):
# If opening and ending character for URL are not the same,
# return text unchanged.
- if begin_url == '<' and end_url != '>' or begin_url == '(' and end_url != ')':
- return matched_string
+ if begin_url:
+ begin_delimeter = begin_url[-1]
+ else:
+ begin_delimeter = ''
+ if end_url:
+ end_delimeter = end_url[0]
+ else:
+ end_delimeter = ''
+
+ if (
+ begin_delimeter == '<' and end_delimeter != '>' or
+ begin_delimeter == '(' and end_delimeter != ')' or
+ end_delimeter == ')' and begin_delimeter != '(' or
+ end_delimeter == '>' and begin_delimeter != '<'
+ ):
+ return url
# If no supported protocol is specified, assume plaintext http
# and add it to the url.
- if protocol not in ('http://', 'https://', 'ftp://', 'ftps://'):
+ if protocol == '':
url = 'http://' + url
# Convenience link to distinguish external links more easily.
diff --git a/tests/plugins/links/test_urlize.py b/tests/plugins/links/test_urlize.py
index d77e4017..31acb240 100644
--- a/tests/plugins/links/test_urlize.py
+++ b/tests/plugins/links/test_urlize.py
@@ -26,11 +26,11 @@ FIXTURE_POSITIVE_MATCHES = [
# Test surrounding begin/end characters.
(
'(example.com)',
- EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com')
+ '(' + EXPECTED_LINK_TEMPLATE % ('http://example.com', 'example.com') + ')
'
),
(
'',
- EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com')
+ '<' + EXPECTED_LINK_TEMPLATE % ('http://example.com', 'example.com') + '>
'
),
# Test protocol specification.
@@ -54,6 +54,14 @@ FIXTURE_POSITIVE_MATCHES = [
'example.com',
EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com')
),
+ (
+ 'onion://example.com',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('onion://example.com', 'onion://example.com')
+ ),
+ (
+ 'onion9+.-://example.com',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('onion9+.-://example.com', 'onion9+.-://example.com')
+ ),
# Test various supported host variations.
(
@@ -88,6 +96,10 @@ FIXTURE_POSITIVE_MATCHES = [
'example.com',
EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com')
),
+ (
+ 'example.horse',
+ EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.horse', 'example.horse')
+ ),
(
'my.long.domain.example.com',
EXPECTED_PARAGRAPH_TEMPLATE % ('http://my.long.domain.example.com', 'my.long.domain.example.com')
@@ -196,6 +208,23 @@ FIXTURE_NEGATIVE_MATCHES = [
'1.2.3.4.5
',
),
+ # Invalid protocols.
+ (
+ '9onion://example.com',
+ '9onion://example.com
',
+ ),
+ (
+ '-onion://example.com',
+ '-onion://example.com
',
+ ),
+ (
+ '+onion://example.com',
+ '+onion://example.com
',
+ ),
+ (
+ '.onion://example.com',
+ '.onion://example.com
',
+ ),
]
@@ -215,6 +244,10 @@ class TestUrlizeExtension:
def test_url_with_non_matching_begin_and_end_ignored(self):
assert self.md.convert('(example.com>') == "%s
" % html.escape('(example.com>')
assert self.md.convert('%s
" % html.escape('%s" % html.escape('(example.com')
+ assert self.md.convert('example.com)') == "%s
" % html.escape('example.com)')
+ assert self.md.convert('%s" % html.escape('') == "%s
" % html.escape('example.com>')
def test_makeExtension_return_value():
--
2.45.2