137 lines
4.2 KiB
Python
137 lines
4.2 KiB
Python
"""
|
|
Utilities for extracting information from html.
|
|
"""
|
|
|
|
import re
|
|
from html import unescape
|
|
from typing import Optional
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from .lang import clean_lang
|
|
from .tag import drop_roles, drop_tags, keep_tags
|
|
|
|
# Precompiled regular expressions used by the helpers in this module.
#
# Every pattern with a backslash escape is a raw string, so that regex
# escapes such as \s are not read as (invalid) Python string escapes.
re_ = {
    # opening <html> tag with a quoted lang attribute; group 1 is the value
    'html_lang': re.compile(
        r'<html[^>]*lang\s*=\s*["\']([^"\']*)["\'][^>]*>', re.I | re.S
    ),
    # <title> element without nested tags; group 1 is its text
    'title': re.compile('<title[^>]*>([^<]*)</title>', re.I | re.S),
    # any opening or closing tag whose name is not a key of keep_tags
    'strip': re.compile(
        '<(?!/?(' + '|'.join(keep_tags.keys()) + ')[ >])[^>]+>', re.I | re.S
    ),
    # element containing nothing but whitespace; group 3 is that whitespace
    'empty_tag': re.compile(r'<(?P<tag>\w+)( [^>]*)?>(\s*)</(?P=tag)>', re.S),
    # run of whitespace chars and/or &nbsp; entities
    'whitespace': re.compile(r'(\s|&nbsp;)+', re.S),
    # one whitespace char, or an &nbsp entity whose ';' may be missing.
    # The entity is spelled out: making a single char optional here would
    # let the pattern match the empty string, and sub() would then insert
    # the replacement between all characters.
    'whitespace_': re.compile(r'\s|&nbsp;?'),
    # attribute-less br, p, ul, li, h1..h6 tags (opening or closing, except
    # that br has no closing form) together with the whitespace around them
    'whitespace_near_tag': re.compile(
        r'\s*<(br|p|/p|ul|/ul|li|/li|h1|/h1'
        r'|h2|/h2|h3|/h3|h4|/h4|h5|/h5|h6|/h6)>\s*',
        re.S,
    ),
    # whitespace followed by one or more tags that are each followed by
    # whitespace; group 2 holds the tags
    'whitespace_tag_tag': re.compile(r'(\s+)((<[^>]+>\s+)+)', re.S),
    # a single tag (group 1) followed by whitespace
    'whitespace_tag_tag_func': re.compile(r'(<[^>]+>)\s+', re.S),
    # a complete <meta ...> tag having an http-equiv attribute
    'http_equiv': re.compile('(<meta [^>]*http-equiv[^>]*>)', re.I | re.S),
}
|
|
|
|
|
|
def whitespace_tag_tag(match_obj):
    """
    Replacement callback for the *whitespace_tag_tag* pattern.

    Take the sequence of tags captured in group 2 of the match, remove the
    whitespace following each tag and return the tags prefixed with a
    single space.
    """
    tags = match_obj.group(2)
    packed = re_['whitespace_tag_tag_func'].sub(r'\1', tags)
    return ' ' + packed
|
|
|
|
|
|
def clean_html(s: Optional[str]) -> Optional[str]:
    """
    Normalize a piece of html text.

    Html entities are unescaped, then every match of the *whitespace_*
    pattern is replaced with ' ' (ASCII char 0x20) and the result is
    stripped. A falsy input (None or the empty string) yields None.

    See also: https://www.lesinskis.com/python-unicode-whitespace.html
    """
    if not s:
        return None
    unescaped = unescape(s)
    spaced = re_['whitespace_'].sub(' ', unescaped)
    return spaced.strip()
|
|
|
|
|
|
def get_html_lang(html: str) -> Optional[str]:
    """
    Look up the lang attribute of the html tag.

    Return the attribute value passed through clean_lang, or None if the
    *html_lang* pattern does not match anywhere in *html*.
    """
    found = re_['html_lang'].search(html)
    if found is None:
        return None
    return clean_lang(found.group(1))
|
|
|
|
|
|
def extract_title(html: str) -> Optional[str]:
    """
    Extract title tags from html returning their content as a string.

    All title elements matched by the *title* pattern are cleaned with
    clean_html and joined with ' - ' in reverse document order; spaces and
    '|' characters are stripped from both ends of the result.

    Return None if no title tag is found. If title tags are found but all
    of them are empty after cleaning, return the empty string.
    """
    raw_titles = re_['title'].findall(html)
    if not raw_titles:
        return None
    cleaned = (clean_html(title) for title in reversed(raw_titles))
    # Filter after cleaning: a whitespace-only title cleans to '' (and an
    # empty one to None) and would otherwise leave a dangling ' - '.
    return ' - '.join(title for title in cleaned if title).strip(' |')
|
|
|
|
|
|
def clean_page(html) -> BeautifulSoup:
    """
    Remove unwanted tags including their content from html.

    Drop tags in *drop_tags* as well as tags with a role in *drop_roles*.
    Also drop tags with attribute aria-hidden=true.

    The html is parsed with the 'html.parser' backend.

    Return a beautiful soup.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in drop_tags:
        for node in soup.find_all(tag):
            node.decompose()
    for node in soup.find_all(attrs={'aria-hidden': 'true'}):
        node.decompose()
    for role in drop_roles:
        # Match on the role attribute; the previous code filtered on
        # 'rel', which holds link relations rather than roles.
        # NOTE(review): assumes drop_roles contains role values -- confirm
        # against its definition in the tag module.
        for node in soup.find_all(attrs={'role': role}):
            node.decompose()
    return soup
|
|
|
|
|
|
def clean_body(body):
    """
    Clean an html body.

    Steps, in this order: replace tags matched by the *strip* pattern with
    a space (their content stays); remove whitespace around the tags
    matched by *whitespace_near_tag* and collapse whitespace runs into one
    space; repeatedly remove empty tags, tidying whitespace again after
    each pass; remove whitespace between consecutive tags; finally strip
    the result and delete soft hyphens.

    The intended outcome is that the only whitespace left is a space and
    that there are no consecutive spaces.
    """
    near_tag = re_['whitespace_near_tag']
    spaces = re_['whitespace']

    def tidy(text):
        # first glue the listed tags to their neighbours, then squeeze
        # whatever whitespace runs remain
        return spaces.sub(' ', near_tag.sub(r'<\1>', text))

    body = tidy(re_['strip'].sub(' ', body))
    while True:
        # removing an empty tag can make its parent empty, so keep going
        # until a pass removes nothing
        body, removed = re_['empty_tag'].subn(r'\3', body)
        if not removed:
            break
        body = tidy(body)
    body = re_['whitespace_tag_tag'].sub(whitespace_tag_tag, body)
    return body.strip().replace('\u00ad', '')  # soft hyphen
|
|
|
|
|
|
def _parse_refresh_target(content: str) -> Optional[str]:
    """
    Return the url from the content attribute of a refresh meta tag.

    The content is expected to look like ``<delay>;url=<target>``. Only
    the first ';' separates delay and target, so the target may itself
    contain ';'. The 'url=' prefix is optional and matched
    case-insensitively; surrounding whitespace and quotes are removed.

    Return None if there is no ';' or the target is empty.
    """
    _, sep, target = content.partition(';')
    if not sep:
        return None
    target = target.strip()
    if target[:4].lower() == 'url=':
        target = target[4:]
    return target.strip().strip('\'"') or None


def get_html_redirect(html: str) -> Optional[str]:
    """
    Return an html redirect in an http-equiv meta tag.

    Every meta tag matched by the *http_equiv* pattern is parsed; those
    with http-equiv="refresh" (compared case-insensitively) and a
    non-empty content attribute are examined. If several of them carry a
    usable url, the last one wins.

    If none is found, return None.
    """
    redir_url = None
    for raw in re_['http_equiv'].findall(html):
        tag = BeautifulSoup(raw, 'html.parser').meta
        if not tag or tag.get('http-equiv', '').lower() != 'refresh':
            continue
        content = tag.get('content')
        if not content:
            continue
        # keep the previous result when this tag has no usable target
        if target := _parse_refresh_target(content):
            redir_url = target
    return redir_url
|