From 389b552c8bad8d8504127f2061aeb71f9d54af6b Mon Sep 17 00:00:00 2001 From: Mathias Rav Date: Sat, 4 Aug 2018 20:59:43 +0200 Subject: [PATCH] Tweak search result highlighting - Preserve case of content rather than keyword - Highlight entire word in case of partial word matching - Use str.find(), str.index() and str.rindex() instead of regexes - Don't insert superfluous spaces in snippet --- src/wiki/templatetags/wiki_tags.py | 40 +++++++++++++++-------------- tests/core/test_template_filters.py | 24 +++++++++++------ 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/src/wiki/templatetags/wiki_tags.py b/src/wiki/templatetags/wiki_tags.py index 48151af8..c9cd8c81 100644 --- a/src/wiki/templatetags/wiki_tags.py +++ b/src/wiki/templatetags/wiki_tags.py @@ -109,31 +109,33 @@ def get_content_snippet(content, keyword, max_words=30): # remove html tags content = striptags(content) - # remove newlines - content = content.replace("\n", " ").split(" ") + # remove whitespace + words = content.split() - return list(filter(lambda x: x != "", content)) + return words max_words = int(max_words) - pattern = re.compile( - r'(?P<before>.*)%s(?P<after>.*)' % re.escape(keyword), - re.MULTILINE | re.IGNORECASE | re.DOTALL - ) + match_position = content.lower().rfind(keyword.lower()) - match = pattern.search(content) - - if match: - words = clean_text(match.group("before")) - before_words = words[-max_words // 2:] - words = clean_text(match.group("after")) - - after = " ".join(words[:max_words - len(before_words)]) + if match_position != -1: + try: + match_start = content.rindex(' ', 0, match_position) + 1 + except ValueError: + match_start = 0 + try: + match_end = content.index(' ', match_position + len(keyword)) + except ValueError: + match_end = len(content) + all_before = clean_text(content[:match_start]) + match = content[match_start:match_end] + all_after = clean_text(content[match_end:]) + before_words = all_before[-max_words // 2:] + after_words = all_after[:max_words - 
len(before_words)] before = " ".join(before_words) - - html = "%s %s %s" % (before, striptags(keyword), after) - - kw_p = re.compile(r'(%s)' % keyword, re.IGNORECASE) + after = " ".join(after_words) + html = ("%s %s %s" % (before, striptags(match), after)).strip() + kw_p = re.compile(r'(\S*%s\S*)' % keyword, re.IGNORECASE) html = kw_p.sub(r"\1", html) return mark_safe(html) diff --git a/tests/core/test_template_filters.py b/tests/core/test_template_filters.py index 9a2186f2..207b6108 100644 --- a/tests/core/test_template_filters.py +++ b/tests/core/test_template_filters.py @@ -19,7 +19,7 @@ class GetContentSnippet(TemplateTestCase): content = text + ' list' expected = ( 'lorem lorem lorem lorem lorem lorem lorem lorem lorem ' - 'lorem lorem lorem lorem lorem lorem list ' + 'lorem lorem lorem lorem lorem lorem list' ) output = get_content_snippet(content, 'list') @@ -30,7 +30,7 @@ class GetContentSnippet(TemplateTestCase): text = 'lorem ' * 80 content = 'list ' + text expected = ( - ' list lorem lorem lorem lorem lorem ' + 'list lorem lorem lorem lorem lorem ' 'lorem lorem lorem lorem lorem lorem lorem lorem lorem lorem lorem ' 'lorem lorem lorem lorem lorem lorem lorem lorem lorem lorem lorem ' 'lorem lorem lorem' @@ -50,7 +50,7 @@ class GetContentSnippet(TemplateTestCase): 'lorem lorem ' 'lorem lorem ' 'lorem lorem ' - 'lorem lorem ' + 'lorem lorem' ) output = get_content_snippet(content, 'lorem') @@ -82,7 +82,7 @@ class GetContentSnippet(TemplateTestCase): expected = ( 'dolorum dolorum dolorum dolorum dolorum dolorum dolorum ' 'dolorum dolorum dolorum dolorum dolorum dolorum dolorum dolorum ' - 'list ' + 'list' ) output = get_content_snippet(content, 'list') @@ -95,7 +95,7 @@ class GetContentSnippet(TemplateTestCase): content = text + ' list' output = get_content_snippet(content, 'list', 0) - expected = 'spam ' * 800 + 'list ' + expected = 'spam ' * 800 + 'list' self.assertEqual(output, expected) @@ -105,7 +105,7 @@ class GetContentSnippet(TemplateTestCase): 
content = text + ' list' output = get_content_snippet(content, 'list', -10) - expected = 'spam ' * 75 + 'list ' + expected = 'spam ' * 75 + 'list' self.assertEqual(output, expected) @@ -154,7 +154,7 @@ class GetContentSnippet(TemplateTestCase): expected = ( 'I should citate Shakespeare or Byron. ' 'Or maybe copy paste from python ' - 'or django documentation. maybe .' + 'or django documentation. Maybe.' ) output = get_content_snippet(content, keyword, 30) @@ -179,10 +179,18 @@ class GetContentSnippet(TemplateTestCase): expected = ( 'knight eggs spam ham ' - 'eggs guido python eggs ' + 'eggs guido python eggs' ) self.assertEqual(output, expected) + def test_content_case_preserved(self): + keyword = 'DOlOr' + match = 'DoLoR' + content = 'lorem ipsum %s sit amet' % match + output = get_content_snippet(content, keyword) + self.assertIn(match, output) + self.assertNotIn(keyword, output) + class CanRead(TemplateTestCase): -- 2.45.2