atextcrawler/src/atextcrawler/utils/html.py

"""
Utilities for extracting information from html.
"""
import re
from html import unescape
from typing import Optional
from bs4 import BeautifulSoup
from .lang import clean_lang
from .tag import drop_roles, drop_tags, keep_tags

re_ = {
    'html_lang': re.compile(
        r'<html[^>]*lang\s*=\s*["\']([^"\']*)["\'][^>]*>', re.I | re.S
    ),
    'title': re.compile('<title[^>]*>([^<]*)</title>', re.I | re.S),
    'strip': re.compile(
        '<(?!/?(' + '|'.join(keep_tags.keys()) + ')[ >])[^>]+>', re.I | re.S
    ),
    'empty_tag': re.compile(r'<(?P<tag>\w+)( [^>]*)?>(\s*)</(?P=tag)>', re.S),
    'whitespace': re.compile(r'(\s|&nbsp;)+', re.S),
    'whitespace_': re.compile(r'\s|&nbsp;?'),  # allow broken &nbsp
    'whitespace_near_tag': re.compile(
        r'\s*<(br|p|/p|ul|/ul|li|/li|h1|/h1'
        r'|h2|/h2|h3|/h3|h4|/h4|h5|/h5|h6|/h6)>\s*',
        re.S,
    ),
    'whitespace_tag_tag': re.compile(r'(\s+)((<[^>]+>\s+)+)', re.S),
    'whitespace_tag_tag_func': re.compile(r'(<[^>]+>)\s+', re.S),
    'http_equiv': re.compile('(<meta [^>]*http-equiv[^>]*>)', re.I | re.S),
}


def whitespace_tag_tag(match_obj):
    """
    Helper function for removing whitespace between tags.
    """
    return ' ' + re_['whitespace_tag_tag_func'].sub(r'\1', match_obj.group(2))
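
# Illustrative usage (not part of the original module): this function is the
# callback passed to re_['whitespace_tag_tag'].sub in clean_body; it keeps a
# single leading space and drops the whitespace after each tag in the run:
#
#     re_['whitespace_tag_tag'].sub(whitespace_tag_tag, 'text  <p> <em> more')
#     # -> 'text <p><em>more'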


def clean_html(s: Optional[str]) -> Optional[str]:
    """
    Clean an html string.

    Unescape html entities and replace whitespace with ' ' (ASCII char 0x20).

    See also: https://www.lesinskis.com/python-unicode-whitespace.html
    """
    return re_['whitespace_'].sub(' ', unescape(s)).strip() if s else None
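
# Illustrative usage (not part of the original module):
#
#     clean_html('  Hello&nbsp;world\n')   # -> 'Hello world'
#     clean_html(None)                     # -> None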


def get_html_lang(html: str) -> Optional[str]:
    """
    Return the language, if any, found in the lang attribute of the html tag.
    """
    m = re_['html_lang'].search(html)
    return clean_lang(m.group(1)) if m else None
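
# Illustrative usage (not part of the original module); the returned value is
# whatever clean_lang (from .lang) makes of the raw attribute, typically a
# normalized short language code:
#
#     get_html_lang('<html lang="de-DE"><head></head></html>')
#     # -> clean_lang('de-DE'), e.g. 'de'
#     get_html_lang('<html><body></body></html>')   # -> None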


def extract_title(html: str) -> Optional[str]:
    """
    Extract title tags from html returning their content as a string.
    """
    if not (titles := re_['title'].findall(html)):
        return None
    titles = [clean_html(title) for title in reversed(titles) if title]
    return ' - '.join(titles).strip(' |')
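
# Illustrative usage (not part of the original module); multiple titles are
# joined in reverse order and html entities are unescaped:
#
#     extract_title('<head><title>Contact &amp; Imprint</title></head>')
#     # -> 'Contact & Imprint'
#     extract_title('<title>Home</title><title>Example Site</title>')
#     # -> 'Example Site - Home'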


def clean_page(html):
    """
    Remove unwanted tags (including their content) from html.

    Drop tags in *drop_tags* as well as tags with a role in *drop_roles*.
    Also drop tags with attribute aria-hidden=true.

    Return a BeautifulSoup object.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in drop_tags:
        for n in soup.find_all(tag):
            n.decompose()
    for n in soup.find_all(attrs={'aria-hidden': 'true'}):
        n.decompose()
    for role in drop_roles:
        for n in soup.find_all(attrs={'rel': role}):
            n.decompose()
    return soup
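
# Illustrative usage (not part of the original module), assuming that
# drop_tags (defined in .tag) contains 'script':
#
#     soup = clean_page(
#         '<body><script>x()</script>'
#         '<div aria-hidden="true">skip</div><p>Hi</p></body>'
#     )
#     str(soup)   # -> '<body><p>Hi</p></body>'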


def clean_body(body):
    """
    Clean an html body.

    Remove unwanted tags (keeping their content); remove empty tags;
    remove and replace whitespace in several ways.
    In the end the only remaining whitespace is a single space (0x20)
    and there are no consecutive spaces.
    """
    body = re_['strip'].sub(' ', body)
    body = re_['whitespace_near_tag'].sub(r'<\1>', body)
    body = re_['whitespace'].sub(' ', body)
    while re_['empty_tag'].search(body):
        body = re_['empty_tag'].sub(r'\3', body)
        body = re_['whitespace_near_tag'].sub(r'<\1>', body)
        body = re_['whitespace'].sub(' ', body)
    body = re_['whitespace_tag_tag'].sub(whitespace_tag_tag, body)
    return body.strip().replace('\u00ad', '')  # soft hyphen
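
# Illustrative usage (not part of the original module), assuming that
# keep_tags (defined in .tag) contains 'p' but not 'span':
#
#     clean_body('<p>  Hello <span>wide</span>   world </p> ')
#     # -> '<p>Hello wide world</p>'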


def get_html_redirect(html: str) -> Optional[str]:
    """
    Return the redirect URL given in an http-equiv meta tag.

    If none is found, return None.
    """
    redir_url = None
    http_equivs = re_['http_equiv'].findall(html)
    for raw in http_equivs:
        tag = BeautifulSoup(raw, 'html.parser').meta
        if tag and tag.get('http-equiv', '').lower() == 'refresh':
            if content := tag.get('content'):
                try:
                    _, redir_url = content.split(';')
                    redir_url = (
                        redir_url.strip()
                        .removeprefix('url=')
                        .removeprefix('URL=')
                        .strip("'")
                    )
                except ValueError:  # malformed content attribute
                    pass
    return redir_url
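
# Illustrative usage (not part of the original module):
#
#     get_html_redirect(
#         '<meta http-equiv="refresh" content="0; url=https://example.org/new">'
#     )
#     # -> 'https://example.org/new'
#     get_html_redirect('<meta charset="utf-8">')   # -> None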