
"""
Utilities for extracting information from html.
"""
import re
from html import unescape
from typing import Optional
from bs4 import BeautifulSoup
from .lang import clean_lang
from .tag import drop_roles, drop_tags, keep_tags
# Pre-compiled regular expressions used by the functions of this module.
# All patterns are raw strings, so regex escapes such as \s are passed to
# the re module verbatim instead of being parsed as string escapes.
re_ = {
    # group 1: value of the lang attribute of the <html> tag
    'html_lang': re.compile(
        r'<html[^>]*lang\s*=\s*["\']([^"\']*)["\'][^>]*>', re.I | re.S
    ),
    # group 1: text between <title ...> and </title>
    'title': re.compile(r'<title[^>]*>([^<]*)</title>', re.I | re.S),
    # any tag except opening/closing tags named after a key of keep_tags
    'strip': re.compile(
        r'<(?!/?(' + '|'.join(keep_tags.keys()) + r')[ >])[^>]+>',
        re.I | re.S,
    ),
    # an element containing nothing but whitespace; group 3 is that whitespace
    'empty_tag': re.compile(r'<(?P<tag>\w+)( [^>]*)?>(\s*)</(?P=tag)>', re.S),
    # a run of whitespace characters and/or &nbsp; entities
    'whitespace': re.compile(r'(\s|&nbsp;)+', re.S),
    # a single whitespace character or &nbsp entity
    'whitespace_': re.compile(r'\s|&nbsp;?'),  # allow broken &nbsp
    # whitespace around selected tags; group 1 is the tag's content.
    # NOTE(review): the shape '\s*<(...)>\s*' follows from the replacement
    # r'<\1>' used with this pattern, but the list of tag names below is an
    # assumption and must be confirmed.
    'whitespace_near_tag': re.compile(
        r'\s*<(br|p|/p|ul|/ul|li|/li|h1|/h1'
        r'|h2|/h2|h3|/h3|h4|/h4|h5|/h5|h6|/h6)>\s*',
        re.S,
    ),
    # whitespace followed by one or more tags that each have trailing
    # whitespace; group 2 is the whole run of tags
    'whitespace_tag_tag': re.compile(r'(\s+)((<[^>]+>\s+)+)', re.S),
    # a tag (group 1) followed by whitespace
    'whitespace_tag_tag_func': re.compile(r'(<[^>]+>)\s+', re.S),
    # group 1: a complete <meta> tag having an http-equiv attribute
    'http_equiv': re.compile(r'(<meta [^>]*http-equiv[^>]*>)', re.I | re.S),
}
def whitespace_tag_tag(match_obj):
    """
    Helper function for removing whitespace between tags.

    Used as replacement callback for ``re_['whitespace_tag_tag']``, whose
    match consists of leading whitespace (group 1) followed by a run of tags
    that each have trailing whitespace (group 2).

    Return a single space followed by the tags of the run with the
    whitespace after each tag removed.
    """
    # NOTE(review): group(2) (the run of tags) is assumed to be the text
    # that gets rewritten here -- confirm.
    tags = match_obj.group(2)
    return ' ' + re_['whitespace_tag_tag_func'].sub(r'\1', tags)
def clean_html(s: Optional[str]) -> Optional[str]:
    """
    Normalize a piece of html text.

    Html entities are unescaped first. After that each whitespace character
    and each ``&nbsp`` entity still present is replaced with ' '
    (ASCII char 0x20), and leading and trailing whitespace is stripped.
    Runs of whitespace are not collapsed.

    Return None if *s* is None or empty.
    """
    if not s:
        return None
    unescaped = unescape(s)
    return re_['whitespace_'].sub(' ', unescaped).strip()
def get_html_lang(html: str) -> Optional[str]:
    """
    Return the language, if any, found in the lang attribute of the html tag.

    The raw attribute value is passed through ``clean_lang``. Return None if
    no html tag with a quoted lang attribute is found.
    """
    m = re_['html_lang'].search(html)
    if m is None:
        return None
    # group 1 is the only capture group of the pattern: the attribute value
    return clean_lang(m.group(1))
def extract_title(html: str) -> Optional[str]:
    """
    Extract title tags from html returning their content as a string.

    The cleaned texts of all title tags are joined with ' - ' in reverse
    order of their appearance; spaces and '|' characters are stripped from
    both ends of the result.

    Return None if there is no title tag at all, and an empty string if
    none of the title tags contains any text.
    """
    if not (titles := re_['title'].findall(html)):
        return None
    # Filter on the *cleaned* text as well: a title consisting of whitespace
    # only is truthy, but cleans to '' and would otherwise leave a dangling
    # ' - ' separator in the result.
    cleaned = [
        text
        for title in reversed(titles)
        if title and (text := clean_html(title))
    ]
    return ' - '.join(cleaned).strip(' |')
def clean_page(html):
    """
    Remove unwanted tags including their content from html.

    Drop tags in *drop_tags* as well as tags with a role in *drop_roles*.
    Also drop tags with attribute aria-hidden=true.

    Return a beautiful soup.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # NOTE(review): decompose() is assumed as the removal operation; it
    # removes a tag together with its content, as the docstring requires.
    for tag in drop_tags:
        for n in soup.find_all(tag):
            n.decompose()
    for n in soup.find_all(attrs={'aria-hidden': 'true'}):
        n.decompose()
    # NOTE(review): this filters on the 'rel' attribute although the
    # docstring speaks of roles; if *drop_roles* holds ARIA role names, the
    # attribute should presumably be 'role' -- confirm before changing.
    for role in drop_roles:
        for n in soup.find_all(attrs={'rel': role}):
            n.decompose()
    return soup
def clean_body(body):
    """
    Clean an html body.

    Tags matched by ``re_['strip']`` are replaced with a space (their
    content is kept), tags containing only whitespace are removed
    repeatedly until none is left, and whitespace is normalized after each
    of these steps. Finally whitespace between consecutive tags is removed
    and soft hyphens are dropped.

    In the end the only whitespace is a space and there are no
    consecutive spaces.
    """

    def _normalize_whitespace(text):
        # drop whitespace around the tags matched by 'whitespace_near_tag',
        # then collapse every remaining whitespace run to a single space
        text = re_['whitespace_near_tag'].sub(r'<\1>', text)
        return re_['whitespace'].sub(' ', text)

    empty_tag = re_['empty_tag']
    body = _normalize_whitespace(re_['strip'].sub(' ', body))
    # removing an empty tag may leave its parent empty, hence the loop
    while empty_tag.search(body):
        body = _normalize_whitespace(empty_tag.sub(r'\3', body))
    body = re_['whitespace_tag_tag'].sub(whitespace_tag_tag, body)
    without_outer_space = body.strip()
    return without_outer_space.replace('\u00ad', '')  # soft hyphen
def get_html_redirect(html: str) -> Optional[str]:
    """
    Return an html redirect in an http-equiv meta tag.

    Every meta tag with http-equiv="refresh" (case-insensitive) is examined.
    Its content attribute is split at the first ';' into a delay and a
    target; an optional leading ``url=`` and surrounding quotes are removed
    from the target. If several tags yield a target, the last one wins.

    If none is found, return None.
    """
    redir_url = None
    for raw in re_['http_equiv'].findall(html):
        tag = BeautifulSoup(raw, 'html.parser').meta
        if not tag or tag.get('http-equiv', '').lower() != 'refresh':
            continue
        content = tag.get('content')
        if not content:
            continue
        # partition() instead of a two-target split(';'): content without
        # any ';' (delay only) or with a ';' inside the url must not raise.
        _, separator, target = content.partition(';')
        if not separator:
            continue
        # NOTE(review): the normalization of the target (removing 'url='
        # and quotes) is an assumption about the intended result -- confirm.
        target = re.sub(r'^\s*url\s*=\s*', '', target, flags=re.I)
        target = target.strip().strip('\'"')
        if target:
            redir_url = target
    return redir_url