From 8cff689b42e4412243c0ec819d15b287c3417c60 Mon Sep 17 00:00:00 2001 From: Branko Majic Date: Fri, 23 Feb 2018 12:01:53 +0100 Subject: [PATCH] Extend custom urlize markdown extension to support any protocol specifier and fix handling of opening/closing brackets: - Updated handling of opening/closing brackets ('()' and '<>') to be more robust, covering some corner cases, and to leave the brackets intact in resulting generated HTML (they should still surround the URL element, and not be removed). - Added support for any valid protocol specifier, according to RFC 3986. - Moved entire regex definition into the URLIZE_RE regex variable. - Updated existing test cases to match new implementation. - Added new test cases to cover new matching functionality. --- src/wiki/plugins/links/mdx/urlize.py | 59 ++++++++++++++++++++-------- tests/plugins/links/test_urlize.py | 37 ++++++++++++++++- 2 files changed, 77 insertions(+), 19 deletions(-) diff --git a/src/wiki/plugins/links/mdx/urlize.py b/src/wiki/plugins/links/mdx/urlize.py index fe487628..4498d3e5 100644 --- a/src/wiki/plugins/links/mdx/urlize.py +++ b/src/wiki/plugins/links/mdx/urlize.py @@ -4,18 +4,20 @@ import markdown # Regular expression is meant to match the following pattern: # -# [BEGIN_URL][PROTOCOL]HOST[:PORT][/[PATH]][END_URL] +# [BEGIN][PROTOCOL]HOST[:PORT][/[PATH]][END] # # Everything except HOST is meant to be optional, as denoted by square # brackets. # # Patter elements are as follows: # -# BEGIN_URL -# Either '<' or '('. +# BEGIN +# String preceding the link. Can be empty, or any string that ends +# in whitespace, '(', or '<'. # # PROTOCOL -# One of: 'http://', 'https://', 'ftp://', or 'ftps://'. +# Syntax defined in https://tools.ietf.org/html/rfc3986 - for +# example: 'http://', 'https://', 'ftp://', or 'ftps://'. 
# # HOST # Host can be one of: IPv4 address, IPv6 address in full form, IPv6 @@ -31,8 +33,10 @@ import markdown # Additional PATH, including any GET parameters that should be part # of the URL. # -# END_URL -# Either '>' or ')'. Should match with same type as BEGIN_URL. +# END +# String following the link. Can be empty, or any string that ends +# in whitespace, ')', or '>'. If ')', then must match with '(' in +# BEGIN. If '>', then must match with '<' in BEGIN. # # It should be noted that there are some inconsitencies with the below # regex, mainly that: @@ -44,20 +48,26 @@ import markdown # In order to make the regex easier to handle later on, the following # named groups are provided: # -# - begin (character denoting beginning of URL) +# - begin (string coming before the link, including whitespace or +# brackets). # - url (entire URL that can be used, for example, as actual link for # href). # - protocol (protocol, together with the trailing ://) # - host (just the host part) # - port (just the port number) # - path (path, combined with any additional GET parameters) -# - end (character denoting end of URL) +# - end (string coming after the link, including whitespace or +# brackets) # URLIZE_RE = ( - r'(?P[\(\<])?(?P' # begin url group + # Links must start at beginning of string, or be preceded with + # whitespace, '(', or '<'. + r'^(?P|.*?[\s\(\<])' + + r'(?P' # begin url group # Leading protocol specification. - r'(?Phttp://|https://|ftp://|ftps://|)' + r'(?P([A-Z][A-Z0-9+.-]*://|))' # Host identifier r'(?P' # begin host identifier group @@ -78,7 +88,10 @@ URLIZE_RE = ( # Optional trailing slash with path and GET parameters. r'(/(?P[^\s\[\(\]\)\<\>]*))?' - r')(?P[\)\>])?' # end url group + r')' # end url group + + # Links must stop at end of string, or be followed by a whitespace, ')', or '>'. 
+ r'(?P[\s\)\>].*?|)$' ) @@ -92,15 +105,13 @@ class UrlizePattern(markdown.inlinepatterns.Pattern): """ # Ensure links are matched only if they stand on their own to avoid bad matches etc. - return re.compile(r'^(|.*?\s)%s(\s.*?|)$' % URLIZE_RE, re.DOTALL | re.UNICODE | re.IGNORECASE) + return re.compile(URLIZE_RE, re.DOTALL | re.UNICODE | re.IGNORECASE) def handleMatch(self, m): """ Processes match found within the text. """ - matched_string = m.group(0) - protocol = m.group('protocol') url = m.group('url') @@ -111,12 +122,26 @@ class UrlizePattern(markdown.inlinepatterns.Pattern): # If opening and ending character for URL are not the same, # return text unchanged. - if begin_url == '<' and end_url != '>' or begin_url == '(' and end_url != ')': - return matched_string + if begin_url: + begin_delimeter = begin_url[-1] + else: + begin_delimeter = '' + if end_url: + end_delimeter = end_url[0] + else: + end_delimeter = '' + + if ( + begin_delimeter == '<' and end_delimeter != '>' or + begin_delimeter == '(' and end_delimeter != ')' or + end_delimeter == ')' and begin_delimeter != '(' or + end_delimeter == '>' and begin_delimeter != '<' + ): + return url # If no supported protocol is specified, assume plaintext http # and add it to the url. - if protocol not in ('http://', 'https://', 'ftp://', 'ftps://'): + if protocol == '': url = 'http://' + url # Convenience link to distinguish external links more easily. diff --git a/tests/plugins/links/test_urlize.py b/tests/plugins/links/test_urlize.py index d77e4017..31acb240 100644 --- a/tests/plugins/links/test_urlize.py +++ b/tests/plugins/links/test_urlize.py @@ -26,11 +26,11 @@ FIXTURE_POSITIVE_MATCHES = [ # Test surrounding begin/end characters. ( '(example.com)', - EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com') + '

(' + EXPECTED_LINK_TEMPLATE % ('http://example.com', 'example.com') + ')

' ), ( '', - EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com') + '

<' + EXPECTED_LINK_TEMPLATE % ('http://example.com', 'example.com') + '>

' ), # Test protocol specification. @@ -54,6 +54,14 @@ FIXTURE_POSITIVE_MATCHES = [ 'example.com', EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com') ), + ( + 'onion://example.com', + EXPECTED_PARAGRAPH_TEMPLATE % ('onion://example.com', 'onion://example.com') + ), + ( + 'onion9+.-://example.com', + EXPECTED_PARAGRAPH_TEMPLATE % ('onion9+.-://example.com', 'onion9+.-://example.com') + ), # Test various supported host variations. ( @@ -88,6 +96,10 @@ FIXTURE_POSITIVE_MATCHES = [ 'example.com', EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.com', 'example.com') ), + ( + 'example.horse', + EXPECTED_PARAGRAPH_TEMPLATE % ('http://example.horse', 'example.horse') + ), ( 'my.long.domain.example.com', EXPECTED_PARAGRAPH_TEMPLATE % ('http://my.long.domain.example.com', 'my.long.domain.example.com') @@ -196,6 +208,23 @@ FIXTURE_NEGATIVE_MATCHES = [ '

1.2.3.4.5

', ), + # Invalid protocols. + ( + '9onion://example.com', + '

9onion://example.com

', + ), + ( + '-onion://example.com', + '

-onion://example.com

', + ), + ( + '+onion://example.com', + '

+onion://example.com

', + ), + ( + '.onion://example.com', + '

.onion://example.com

', + ), ] @@ -215,6 +244,10 @@ class TestUrlizeExtension: def test_url_with_non_matching_begin_and_end_ignored(self): assert self.md.convert('(example.com>') == "

%s

" % html.escape('(example.com>') assert self.md.convert('%s

" % html.escape('(example.com') + assert self.md.convert('example.com)') == "

%s

" % html.escape('example.com)') + assert self.md.convert('') == "

%s

" % html.escape('example.com>') def test_makeExtension_return_value(): -- 2.45.2