atextcrawler/src/atextcrawler/utils/html.py

"""
Utilities for extracting information from html.
"""
import re
from html import unescape
from typing import Optional
from bs4 import BeautifulSoup
from .lang import clean_lang
from .tag import drop_roles, drop_tags, keep_tags

re_ = {
    'html_lang': re.compile(
        r'<html[^>]*lang\s*=\s*["\']([^"\']*)["\'][^>]*>', re.I | re.S
    ),
    'title': re.compile('<title[^>]*>([^<]*)</title>', re.I | re.S),
    'strip': re.compile(
        '<(?!/?(' + '|'.join(keep_tags.keys()) + ')[ >])[^>]+>', re.I | re.S
    ),
    'empty_tag': re.compile(r'<(?P<tag>\w+)( [^>]*)?>(\s*)</(?P=tag)>', re.S),
    'whitespace': re.compile(r'(\s|&nbsp;)+', re.S),
    'whitespace_': re.compile(r'\s|&nbsp;?'),  # allow broken &nbsp
    'whitespace_near_tag': re.compile(
        r'\s*<(br|p|/p|ul|/ul|li|/li|h1|/h1'
        r'|h2|/h2|h3|/h3|h4|/h4|h5|/h5|h6|/h6)>\s*',
        re.S,
    ),
    'whitespace_tag_tag': re.compile(r'(\s+)((<[^>]+>\s+)+)', re.S),
    'whitespace_tag_tag_func': re.compile(r'(<[^>]+>)\s+', re.S),
    'http_equiv': re.compile('(<meta [^>]*http-equiv[^>]*>)', re.I | re.S),
}


def whitespace_tag_tag(match_obj):
    """
    Helper function for removing whitespace between tags.
    """
    return ' ' + re_['whitespace_tag_tag_func'].sub(r'\1', match_obj.group(2))
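
# Illustrative usage (not part of the original module): this function is the
# callback passed to re_['whitespace_tag_tag'].sub in clean_body; it keeps a
# single leading space and drops the whitespace after each tag in the run:
#
#     re_['whitespace_tag_tag'].sub(whitespace_tag_tag, 'text  <p> <em> more')
#     # -> 'text <p><em>more'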


def clean_html(s: Optional[str]) -> Optional[str]:
    """
    Clean an html string.

    Unescape html entities and replace whitespace with ' ' (ASCII char 0x20).

    See also: https://www.lesinskis.com/python-unicode-whitespace.html
    """
    return re_['whitespace_'].sub(' ', unescape(s)).strip() if s else None
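
# Illustrative usage (not part of the original module):
#
#     clean_html('  Hello&nbsp;world\n')   # -> 'Hello world'
#     clean_html(None)                     # -> None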


def get_html_lang(html: str) -> Optional[str]:
    """
    Return the language, if any, found in the lang attribute of the html tag.
    """
    m = re_['html_lang'].search(html)
    return clean_lang(m.group(1)) if m else None
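
# Illustrative usage (not part of the original module); the returned value is
# whatever clean_lang (from .lang) makes of the raw attribute, typically a
# normalized short language code:
#
#     get_html_lang('<html lang="de-DE"><head></head></html>')
#     # -> clean_lang('de-DE'), e.g. 'de'
#     get_html_lang('<html><body></body></html>')   # -> None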


def extract_title(html: str) -> Optional[str]:
    """
    Extract title tags from html returning their content as a string.
    """
    if not (titles := re_['title'].findall(html)):
        return None
    titles = [clean_html(title) for title in reversed(titles) if title]
    return ' - '.join(titles).strip(' |')
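
# Illustrative usage (not part of the original module); multiple titles are
# joined in reverse order and html entities are unescaped:
#
#     extract_title('<head><title>Contact &amp; Imprint</title></head>')
#     # -> 'Contact & Imprint'
#     extract_title('<title>Home</title><title>Example Site</title>')
#     # -> 'Example Site - Home'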


def clean_page(html):
    """
    Remove unwanted tags (including their content) from html.

    Drop tags in *drop_tags* as well as tags with a role in *drop_roles*.
    Also drop tags with attribute aria-hidden=true.

    Return a BeautifulSoup object.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in drop_tags:
        for n in soup.find_all(tag):
            n.decompose()
    for n in soup.find_all(attrs={'aria-hidden': 'true'}):
        n.decompose()
    for role in drop_roles:
        for n in soup.find_all(attrs={'rel': role}):
            n.decompose()
    return soup
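
# Illustrative usage (not part of the original module), assuming that
# drop_tags (defined in .tag) contains 'script':
#
#     soup = clean_page(
#         '<body><script>x()</script>'
#         '<div aria-hidden="true">skip</div><p>Hi</p></body>'
#     )
#     str(soup)   # -> '<body><p>Hi</p></body>'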


def clean_body(body):
    """
    Clean an html body.

    Remove unwanted tags (keeping their content); remove empty tags;
    remove and replace whitespace in several ways.
    In the end the only remaining whitespace is a single space (0x20)
    and there are no consecutive spaces.
    """
    body = re_['strip'].sub(' ', body)
    body = re_['whitespace_near_tag'].sub(r'<\1>', body)
    body = re_['whitespace'].sub(' ', body)
    while re_['empty_tag'].search(body):
        body = re_['empty_tag'].sub(r'\3', body)
        body = re_['whitespace_near_tag'].sub(r'<\1>', body)
        body = re_['whitespace'].sub(' ', body)
    body = re_['whitespace_tag_tag'].sub(whitespace_tag_tag, body)
    return body.strip().replace('\u00ad', '')  # soft hyphen
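
# Illustrative usage (not part of the original module), assuming that
# keep_tags (defined in .tag) contains 'p' but not 'span':
#
#     clean_body('<p>  Hello <span>wide</span>   world </p> ')
#     # -> '<p>Hello wide world</p>'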


def get_html_redirect(html: str) -> Optional[str]:
    """
    Return the redirect URL given in an http-equiv meta tag.

    If none is found, return None.
    """
    redir_url = None
    http_equivs = re_['http_equiv'].findall(html)
    for raw in http_equivs:
        tag = BeautifulSoup(raw, 'html.parser').meta
        if tag and tag.get('http-equiv', '').lower() == 'refresh':
            if content := tag.get('content'):
                try:
                    _, redir_url = content.split(';')
                    redir_url = (
                        redir_url.strip()
                        .removeprefix('url=')
                        .removeprefix('URL=')
                        .strip("'")
                    )
                except ValueError:  # malformed content attribute
                    pass
    return redir_url
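
# Illustrative usage (not part of the original module):
#
#     get_html_redirect(
#         '<meta http-equiv="refresh" content="0; url=https://example.org/new">'
#     )
#     # -> 'https://example.org/new'
#     get_html_redirect('<meta charset="utf-8">')   # -> None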