~netlandish/django-wiki

389b552c8bad8d8504127f2061aeb71f9d54af6b — Mathias Rav 6 years ago dd09cab
Tweak search result highlighting

- Preserve case of content rather than keyword

- Highlight entire word in case of partial word matching

- Use str.find(), str.index() and str.rindex() instead of regexes

- Don't insert superfluous spaces in snippet
2 files changed, 37 insertions(+), 27 deletions(-)

M src/wiki/templatetags/wiki_tags.py
M tests/core/test_template_filters.py
M src/wiki/templatetags/wiki_tags.py => src/wiki/templatetags/wiki_tags.py +21 -19
@@ 109,31 109,33 @@ def get_content_snippet(content, keyword, max_words=30):

        # remove html tags
        content = striptags(content)
        # remove newlines
        content = content.replace("\n", " ").split(" ")
        # remove whitespace
        words = content.split()

        return list(filter(lambda x: x != "", content))
        return words

    max_words = int(max_words)

    pattern = re.compile(
        r'(?P<before>.*)%s(?P<after>.*)' % re.escape(keyword),
        re.MULTILINE | re.IGNORECASE | re.DOTALL
    )
    match_position = content.lower().rfind(keyword.lower())

    match = pattern.search(content)

    if match:
        words = clean_text(match.group("before"))
        before_words = words[-max_words // 2:]
        words = clean_text(match.group("after"))

        after = " ".join(words[:max_words - len(before_words)])
    if match_position != -1:
        try:
            match_start = content.rindex(' ', 0, match_position) + 1
        except ValueError:
            match_start = 0
        try:
            match_end = content.index(' ', match_position + len(keyword))
        except ValueError:
            match_end = len(content)
        all_before = clean_text(content[:match_start])
        match = content[match_start:match_end]
        all_after = clean_text(content[match_end:])
        before_words = all_before[-max_words // 2:]
        after_words = all_after[:max_words - len(before_words)]
        before = " ".join(before_words)

        html = "%s %s %s" % (before, striptags(keyword), after)

        kw_p = re.compile(r'(%s)' % keyword, re.IGNORECASE)
        after = " ".join(after_words)
        html = ("%s %s %s" % (before, striptags(match), after)).strip()
        kw_p = re.compile(r'(\S*%s\S*)' % keyword, re.IGNORECASE)
        html = kw_p.sub(r"<strong>\1</strong>", html)

        return mark_safe(html)

M tests/core/test_template_filters.py => tests/core/test_template_filters.py +16 -8
@@ 19,7 19,7 @@ class GetContentSnippet(TemplateTestCase):
        content = text + ' list'
        expected = (
            'lorem lorem lorem lorem lorem lorem lorem lorem lorem '
            'lorem lorem lorem lorem lorem lorem <strong>list</strong> '
            'lorem lorem lorem lorem lorem lorem <strong>list</strong>'
        )

        output = get_content_snippet(content, 'list')


@@ 30,7 30,7 @@ class GetContentSnippet(TemplateTestCase):
        text = 'lorem ' * 80
        content = 'list ' + text
        expected = (
            ' <strong>list</strong> lorem lorem lorem lorem lorem '
            '<strong>list</strong> lorem lorem lorem lorem lorem '
            'lorem lorem lorem lorem lorem lorem lorem lorem lorem lorem lorem '
            'lorem lorem lorem lorem lorem lorem lorem lorem lorem lorem lorem '
            'lorem lorem lorem'


@@ 50,7 50,7 @@ class GetContentSnippet(TemplateTestCase):
            '<strong>lorem</strong> <strong>lorem</strong> '
            '<strong>lorem</strong> <strong>lorem</strong> '
            '<strong>lorem</strong> <strong>lorem</strong> '
            '<strong>lorem</strong> <strong>lorem</strong> '
            '<strong>lorem</strong> <strong>lorem</strong>'
        )

        output = get_content_snippet(content, 'lorem')


@@ 82,7 82,7 @@ class GetContentSnippet(TemplateTestCase):
        expected = (
            'dolorum dolorum dolorum dolorum dolorum dolorum dolorum '
            'dolorum dolorum dolorum dolorum dolorum dolorum dolorum dolorum '
            '<strong>list</strong> '
            '<strong>list</strong>'
        )

        output = get_content_snippet(content, 'list')


@@ 95,7 95,7 @@ class GetContentSnippet(TemplateTestCase):
        content = text + ' list'

        output = get_content_snippet(content, 'list', 0)
        expected = 'spam ' * 800 + '<strong>list</strong> '
        expected = 'spam ' * 800 + '<strong>list</strong>'

        self.assertEqual(output, expected)



@@ 105,7 105,7 @@ class GetContentSnippet(TemplateTestCase):
        content = text + ' list'

        output = get_content_snippet(content, 'list', -10)
        expected = 'spam ' * 75 + '<strong>list</strong> '
        expected = 'spam ' * 75 + '<strong>list</strong>'

        self.assertEqual(output, expected)



@@ 154,7 154,7 @@ class GetContentSnippet(TemplateTestCase):
        expected = (
            'I should citate Shakespeare or Byron. '
            'Or <strong>maybe</strong> copy paste from python '
            'or django documentation. <strong>maybe</strong> .'
            'or django documentation. <strong>Maybe.</strong>'
        )

        output = get_content_snippet(content, keyword, 30)


@@ 179,10 179,18 @@ class GetContentSnippet(TemplateTestCase):

        expected = (
            'knight <strong>eggs</strong> spam ham '
            '<strong>eggs</strong> guido python <strong>eggs</strong> '
            '<strong>eggs</strong> guido python <strong>eggs</strong>'
        )
        self.assertEqual(output, expected)

    def test_content_case_preserved(self):
        # The search keyword and the word in the content differ only in
        # letter case; the snippet must show the content's spelling and
        # must not substitute the keyword's spelling for it.
        needle = 'DOlOr'
        found = 'DoLoR'
        snippet = get_content_snippet(
            'lorem ipsum %s sit amet' % found, needle
        )
        self.assertIn(found, snippet)
        self.assertNotIn(needle, snippet)


class CanRead(TemplateTestCase):