"""
|
|
Parse HTML pages.
|
|
"""

import logging
from copy import deepcopy
from typing import Optional, Union

from bs4 import BeautifulSoup
from tidylib import tidy_document

from ..models import ResourceError, ResourceRedirect, Site, TextResource
from ..utils.annotation import (
    annotate,
    annotations_remove_section,
    clean_annotations,
    get_tag_counts,
    headline_probability,
)
from ..utils.date_finder import extract_latest_date
from ..utils.durl import Durl, assort_links
from ..utils.html import (
    clean_body,
    clean_page,
    extract_title,
    get_html_lang,
    get_html_redirect,
)
from ..utils.http import get_header_links
from ..utils.lang import extract_content_language
from ..utils.section import iter_sections
from ..utils.tag import keep_tags

logger = logging.getLogger(__name__)
logger_debug = logging.getLogger(__name__ + '.debug')
logger_debug.setLevel(logging.INFO)
logger_links = logging.getLogger(__name__ + '.debug.links')
logger_stats = logging.getLogger(__name__ + '.debug.stats')
logger_sections = logging.getLogger(__name__ + '.debug.sections')


async def parse_html(
    durl: Durl,
    resp: dict,
    site: Optional[Site],
) -> Optional[Union[TextResource, ResourceError, ResourceRedirect]]:
    """
    Extract relevant data from a response, returning a TextResource instance.

    The given URL must be the full URL (incl. scheme and netloc) of the page.
    """
    html = resp['content']

    # follow link to canonical URL
    header_links = await get_header_links(resp['headers'], durl, site)
    if canonical := header_links.get('canonical'):
        if canonical != durl.url():
            return ResourceRedirect(resp['redirects'] + [canonical])

    # follow html redirect, if present
    if redir_url := get_html_redirect(html):
        if redir_url not in resp['redirects']:
            return ResourceRedirect(resp['redirects'] + [redir_url])
        else:
            msg = f'Cyclic HTML redirect: {redir_url} in {resp["redirects"]}'
            return ResourceError(msg)

    # require an html document (doctype declaration or <html> tag)
    if not html[:14].lower().startswith('<!doctype html'):
        if '<html' not in html:
            return None

    # real URL after redirection
    url = resp['redirects'][-1]
    durl = await Durl(url)
    if not durl:
        return None

    # page title
    title = extract_title(html)

    # tidy html
    try:
        html, _ = tidy_document(
            html.encode('utf-8'),
            options={
                'logical-emphasis': 1,
                'merge-divs': 1,
                'merge-spans': 1,
                'hide-comments': 1,
                'output-bom': 0,
                'show-errors': 0,
            },
        )
        html = html.decode('utf-8')
    except Exception:
        msg = f'Cannot tidy html from {url}'
        return ResourceError(msg)

    # drop irrelevant tags, including their contents
    soup = clean_page(html)

    # extract shortlink (from http headers or html head)
    shortlink = header_links.get('shortlink')
    if not shortlink and soup.head:
        for link in soup.head.find_all('link'):
            if 'shortlink' in link.get('rel', ''):
                if link.get('href'):
                    shortlink = link.get('href')
                    break

    # language, plaintext, annotations, last change
    lang = get_html_lang(html)
    html = clean_body(str(soup.body))
    head = soup.head
    text, annotations = annotate(html)
    if lng := extract_content_language(text):
        lang = lng
    last_change = extract_latest_date(html, lang=lang)

    # assort internal and external links
    base_url = None
    if head and head.base:
        base_url = head.base.get('href')
    if not base_url and site:
        base_url = site.base_url
    cleaned_links, links_int, links_ext = await assort_links(
        annotations['links'], durl, text, base_url
    )
    annotations['links'] = cleaned_links
    if logger_links.isEnabledFor(logging.DEBUG):
        logger_links.debug('==== internal links')
        for durl_, txt in links_int.items():
            logger_links.debug(f'{durl_.url()} {txt}')
        logger_links.debug('==== external links')
        for durl_, txt in links_ext.items():
            logger_links.debug(f'{durl_.url()} {txt}')

    # keywords from category links
    category_links = set()
    for href, (i, f, rel) in annotations['links'].items():
        if rel and ('category' in rel or 'tag' in rel):
            category_links.add(text[i:f])
    keywords = sorted(category_links)

    # filter out irrelevant sections
    filtered_text, filtered_ann = filter_sections(
        text, annotations, site.boilerplate_texts if site else None
    )

    # debug statistics
    if logger_stats.isEnabledFor(logging.DEBUG):
        sb = annotations['semantic_breaks']
        fsb = filtered_ann['semantic_breaks']
        logger_stats.debug(
            f'Page statistics:'
            f' html_len={len(html)} text_len={len(filtered_text)}'
            f' ratio={len(filtered_text) / len(html):.2f};'
            f' sections={len(sb)} filtered_sections={len(fsb)}'
            f' ratio={len(fsb) / len(sb):.2f} url={durl.url()}'
        )

    return TextResource(
        content_type='html',
        last_change=last_change,
        text_len=len(text),
        lang=lang,
        title=title,
        init_fields={
            'durl': durl,
            'site': site,
            'headers': resp['headers'],
            'redirects': resp['redirects'],
            'links_int': links_int,
            'links_ext': links_ext,
            'shortlink': shortlink,
            'canonical': True if canonical else None,
            'head': head,
        },
        search_fields={
            'title': title,
            'pub_date': last_change,
            'keywords': keywords,
            'text': filtered_text,
            'annotations': filtered_ann,
            'head': str(head),
        },
    )
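

# Usage sketch (illustrative only, not part of this module's API): the shape
# of `resp` below (keys 'content', 'headers', 'redirects') mirrors what
# parse_html() reads above; the fetcher producing it and the Site setup are
# assumptions.
#
#     durl = await Durl('https://example.com/page')
#     resp = {
#         'content': '<!doctype html><html>...</html>',
#         'headers': {},
#         'redirects': ['https://example.com/page'],  # final URL comes last
#     }
#     resource = await parse_html(durl, resp, site=None)
#     if isinstance(resource, TextResource):
#         ...  # use the parsed resource (field access is assumed here)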


def filter_sections(text, annotations, boilerplate_texts):
    """
    Filter out irrelevant sections using scores and factoring in neighbors.
    """
    tags = annotations['tags']
    sb = annotations['semantic_breaks']
    section_ids = annotations['section_ids']

    # for i1, f1 in sorted(tags.keys()):
    #     print(' ', i1, f1, tags[(i1, f1)], text[i1:f1])
    # for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
    #     print('-' * lvl, i, f, ','.join(tags[(i + 1, f)]), sb[i], txt)
    # print('_' * 50)
    # from pprint import pprint
    # pprint(sb)
    # pprint(tags)
    # pprint(section_ids)

    # calculate keep scores for sections
    # negative scores mean: drop; positive scores mean keep;
    # scores between -2 and 2 are undecided
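    # For example (illustrative numbers, assuming per-section punctuation
    # counts as below): a 200-character paragraph starts at
    # w = (200 - 80) / 80 = 1.5; two sentence-ending periods add 0.8 and a
    # surrounding <p> tag adds 0.7, putting it near w = 3.0, i.e. kept.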
    sections_keep = {}
    headline_probs = {}
    for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
        if prob := headline_probability(txt, tags[(i, f)], lvl):
            headline_probs[(i, f)] = prob
        n_chars = f - i - 1
        # string length
        w = (n_chars - 80) / 80  # initial weight
        # punctuation (counted in this section only)
        w += 0.4 * txt.count('.') + 0.1 * txt.count(',')
        # p tag
        if 'p' in tags[(i + 1, f)]:  # prefer keeping paragraphs
            w += 0.7
        # links
        n_links, link_density, avg_text_len = get_tag_counts(
            ('a',), i, f, tags, text
        )
        if link_density > 0.5:
            w = -n_links
        elif link_density > 0.3 and avg_text_len < 60:
            w = -3
        else:
            n_li, li_density, li_len = get_tag_counts(
                ('li',), i, f, tags, text
            )
            if link_density > 0.2 and li_density > 0.8 and li_len < 50:
                w = -3
        if 52 <= lvl < 60:
            w = max(w, 1.0)
        if 'sidebar' in ' '.join(section_ids.get(i, [])):
            w = -3
        if len(txt) < 20 and ('RSS' in txt or 'MENU' in txt):
            w = -3
        # special chars
        if txt.startswith('←') or txt.endswith('→'):  # wordpress navigation
            w = -3
        # remove boilerplate texts
        if boilerplate_texts and txt in boilerplate_texts:
            w = -10
        sections_keep[(i, f)] = w, lvl

    # amend keep scores: look at preceding / subsequent sections with
    # equal level and transfer their keep scores to the current section
    n = len(sections_keep)
    sections = sorted(sections_keep.keys())
    # inspect subsequent sections:
    for rev_ind, s_range in enumerate(reversed(sections)):
        ind = n - 1 - rev_ind
        w, lvl = sections_keep[s_range]
        if abs(w) <= 2:
            w_sum = 0
            n_peers = 0
            for i in range(ind + 1, min(n, ind + 15)):
                w_, lvl_ = sections_keep[sections[i]]
                if lvl_ != lvl:
                    break
                n_peers += 1
                w_sum += w_
            if n_peers >= 3:
                sections_keep[s_range] = w + 2 * w_sum / n_peers, lvl
    # inspect preceding sections:
    for ind, s_range in enumerate(sections):
        w, lvl = sections_keep[s_range]
        if abs(w) <= 2:
            w_sum = 0
            n_peers = 0
            for i in range(ind - 1, max(0, ind - 15), -1):
                w_, lvl_ = sections_keep[sections[i]]
                if lvl_ != lvl:
                    break
                n_peers += 1
                w_sum += w_
            if n_peers >= 3:
                sections_keep[s_range] = w + 2 * w_sum / n_peers, lvl

    # amend keep scores: look at sections that could be headlines
    # for subsequent kept sections and increase their score;
    # also allow for up to 2 sections in between (which will also
    # have their score increased)
    for rev_ind, s_range in enumerate(reversed(sections)):
        ind = n - 1 - rev_ind
        w, lvl = sections_keep[s_range]
        if abs(w) <= 2:
            if headline_probs.get(s_range, 0) > 0.49:
                # look at subsequent sections with higher level
                child_weights = []
                for i in range(ind + 1, n):
                    w_, lvl_ = sections_keep[sections[i]]
                    if lvl_ <= lvl or w_ < -2:
                        break
                    child_weights.append(w_)
                if nc := len(child_weights):
                    child_avg = sum(child_weights) / nc
                    if w + 1.2 * child_avg > 2:
                        sections_keep[s_range] = w + 1.2 * child_avg, lvl
                        if nc > 1:
                            if (w1 := child_weights[0]) <= 2:
                                sections_keep[sections[ind + 1]] = (
                                    w1 + 1.5 * child_avg,
                                    lvl,
                                )
                        if nc > 2:
                            if (w2 := child_weights[1]) <= 2:
                                sections_keep[sections[ind + 2]] = (
                                    w2 + 2 * child_avg,
                                    lvl,
                                )

    # clean annotations
    clean_annotations(annotations)

    # debug sections
    if logger_sections.isEnabledFor(logging.DEBUG):
        logger_sections.debug('============= Weighted sections =============')
        for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
            w, lvl = sections_keep[(i, f)]
            indent = ('+' if w > 2 else '-') * lvl
            ts = ','.join(tags[(i + 1, f)])
            logger_sections.debug(f'{indent} {i} {f} {ts} {txt} {w:.2f}')

    # narrow down annotations and text to the kept sections;
    # drop undecided sections
    filtered_text = text
    filtered_ann = deepcopy(annotations)
    for i, f in sorted(sections_keep.keys(), reverse=True):
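        # sections are traversed right-to-left (reverse=True) so that cutting
        # a span out of filtered_text does not shift the offsets of the
        # sections still to be processed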
        w, lvl = sections_keep[(i, f)]
        if w <= 2.0:
            filtered_ann = annotations_remove_section(filtered_ann, i, f)
            filtered_text = filtered_text[:i] + filtered_text[f:]
    clean_annotations(filtered_ann)

    # debug filtered sections
    if logger_sections.isEnabledFor(logging.DEBUG):
        logger_sections.debug('')
        logger_sections.debug('============= Filtered sections =============')
        fsb = filtered_ann['semantic_breaks']
        ftags = filtered_ann['tags']
        for i, f, lvl, txt in iter_sections(filtered_text, fsb, max_level=100):
            indent = ' ' * lvl
            ts = ','.join(ftags.get((i + 1, f), []))
            logger_sections.debug(f'{indent} {lvl} {i} {f} {ts} {txt}')

    return filtered_text, filtered_ann
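

if __name__ == '__main__':
    # Illustrative, self-contained demo of the neighbor-smoothing step used
    # in filter_sections() above (toy data, not the crawler's real types):
    # an undecided section (|w| <= 2) receives twice the average score of at
    # least 3 same-level peers that follow it.
    toy_scores = {  # (start, end) -> (score, level)
        (0, 10): (0.5, 3),  # undecided on its own
        (10, 20): (3.0, 3),
        (20, 30): (4.0, 3),
        (30, 40): (2.5, 3),
    }
    spans = sorted(toy_scores)
    n = len(spans)
    for ind, s_range in enumerate(spans):
        w, lvl = toy_scores[s_range]
        if abs(w) <= 2:
            w_sum = 0.0
            n_peers = 0
            for j in range(ind + 1, min(n, ind + 15)):
                w_, lvl_ = toy_scores[spans[j]]
                if lvl_ != lvl:
                    break
                n_peers += 1
                w_sum += w_
            if n_peers >= 3:
                toy_scores[s_range] = w + 2 * w_sum / n_peers, lvl
    # the first section is lifted from 0.5 to ~6.83 and would now be kept
    print(toy_scores)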