Bradley Kirton's Blog

Published on March 29, 2024

Token highlighting for web search

This snippet wraps each token of the provided text in a `<span>` tag and applies the Tailwind `font-bold` class to the tokens that match.

import dataclasses
import re

from django.utils.html import format_html, format_html_join

@dataclasses.dataclass
class Token:
    """Models a token.

    The dataclass decorator generates the ``Token(token_type, value)``
    constructor that the scanner callbacks rely on.
    """

    # Category of the token: "match", "whitespace" or "nonmatch".
    token_type: str
    # The exact slice of the input text that this token covers.
    value: str

def highlight_matched_tokens(text: str, matches: set[str]) -> str:
    """Apply highlighting to the matches found within the provided text.

    The text is split into tokens (matched terms, whitespace runs and other
    non-whitespace runs). Every token is wrapped in a ``<span>``; matched
    tokens additionally receive the ``font-bold`` class. Matching is
    case-insensitive.

    :param text: A string of text to be highlighted.
    :param matches: A set of tokens to highlight.
    :returns: The text with the appropriate styling applied, as an
        HTML-safe string in which every token value has been escaped.
    """
    # Empty strings are dropped: a zero-width pattern makes re.Scanner stop
    # scanning immediately, which would silently discard the whole text.
    # Longest-first ordering makes overlapping terms (e.g. "foo"/"foobar")
    # resolve deterministically, because the scanner takes the first
    # alternative that matches rather than the longest one.
    ordered_matches = sorted((m for m in matches if m), key=len, reverse=True)

    patterns = [
        (re.escape(m), lambda _, value: Token("match", value))
        for m in ordered_matches
    ]
    # Fallbacks: together these two consume any character the match patterns
    # did not, so the scanner never leaves an unscanned remainder.
    patterns += [
        (r"\s+", lambda _, value: Token("whitespace", value)),
        (r"\S+", lambda _, value: Token("nonmatch", value)),
    ]

    scanner = re.Scanner(patterns, re.UNICODE | re.IGNORECASE)  # type: ignore
    tokens, _ = scanner.scan(text)

    # Escape each token value through format_html's placeholder instead of
    # interpolating it into the markup: raw interpolation allows HTML
    # injection and breaks on text containing "{" or "}".
    spans = []
    for token in tokens:
        if token.token_type == "match":
            spans.append(
                format_html('<span class="font-bold">{}</span>', token.value)
            )
        else:
            spans.append(format_html("<span>{}</span>", token.value))

    # The spans are already safe strings, so joining does not re-escape them;
    # an empty token list yields an empty safe string.
    return format_html_join("", "{}", ((span,) for span in spans))