# atextcrawler/src/atextcrawler/resource/page.py
"""
Parse HTML pages.
"""
import logging
from copy import deepcopy
from typing import Optional, Union

from bs4 import BeautifulSoup
from tidylib import tidy_document

from ..models import ResourceError, ResourceRedirect, Site, TextResource
from ..utils.annotation import (
annotate,
annotations_remove_section,
clean_annotations,
get_tag_counts,
headline_probability,
)
from ..utils.date_finder import extract_latest_date
from ..utils.durl import Durl, assort_links
from ..utils.html import (
clean_body,
clean_page,
extract_title,
get_html_lang,
get_html_redirect,
)
from ..utils.http import get_header_links
from ..utils.lang import extract_content_language
from ..utils.section import iter_sections
from ..utils.tag import keep_tags

logger = logging.getLogger(__name__)
logger_debug = logging.getLogger(__name__ + '.debug')
logger_debug.setLevel(logging.INFO)
logger_links = logging.getLogger(__name__ + '.debug.links')
logger_stats = logging.getLogger(__name__ + '.debug.stats')
logger_sections = logging.getLogger(__name__ + '.debug.sections')


async def parse_html(
durl: Durl,
resp: dict,
site: Optional[Site],
) -> Optional[Union[TextResource, ResourceError, ResourceRedirect]]:
"""
Extract relevant data from a response returning a TextResource instance.
The given URL must be the full URL (incl. scheme and netloc) of the page.
"""
html = resp['content']
# follow link to canonical URL
header_links = await get_header_links(resp['headers'], durl, site)
if canonical := header_links.get('canonical'):
if canonical != durl.url():
return ResourceRedirect(resp['redirects'] + [canonical])
# follow html redirect, if present
if redir_url := get_html_redirect(html):
if redir_url not in resp['redirects']:
return ResourceRedirect(resp['redirects'] + [redir_url])
else:
msg = f'Cyclic HTML redirect: {redir_url} in {resp["redirects"]}'
return ResourceError(msg)
    # require an html doctype or an <html> tag
if not html[:14].lower().startswith('<!doctype html'):
if '<html' not in html:
return None
# real URL after redirection
url = resp['redirects'][-1]
durl = await Durl(url)
if not durl:
return None
# page title
title = extract_title(html)
# tidy html
try:
html, _ = tidy_document(
html.encode('utf-8'),
options={
'logical-emphasis': 1,
'merge-divs': 1,
'merge-spans': 1,
'hide-comments': 1,
'output-bom': 0,
'show-errors': 0,
},
)
html = html.decode('utf-8')
    except Exception:
msg = f'Cannot tidy html from {url}'
return ResourceError(msg)
# drop irrelevant tags, including their contents
soup = clean_page(html)
# extract shortlink (from http headers or html head)
shortlink = header_links.get('shortlink')
if not shortlink and soup.head:
for link in soup.head.find_all('link'):
if 'shortlink' in link.get('rel', ''):
if link.get('href'):
shortlink = link.get('href')
break
# language, plaintext, annotations, last change
lang = get_html_lang(html)
html = clean_body(str(soup.body))
head = soup.head
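    # annotate() yields the plain text and stand-off annotations; the keys
    # used below are 'links', 'tags', 'semantic_breaks' and 'section_ids'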
text, annotations = annotate(html)
if lng := extract_content_language(text):
lang = lng
last_change = extract_latest_date(html, lang=lang)
# assort internal and external links
base_url = None
if head and head.base:
base_url = head.base.get('href')
if not base_url and site:
base_url = site.base_url
cleaned_links, links_int, links_ext = await assort_links(
annotations['links'], durl, text, base_url
)
annotations['links'] = cleaned_links
if logger_links.isEnabledFor(logging.DEBUG):
logger_links.debug('==== internal links')
for durl_, txt in links_int.items():
logger_links.debug(f'{durl_.url()} {txt}')
logger_links.debug('==== external links')
for durl_, txt in links_ext.items():
logger_links.debug(f'{durl_.url()} {txt}')
# keywords from category links
category_links = set()
for href, (i, f, rel) in annotations['links'].items():
if rel and ('category' in rel or 'tag' in rel):
category_links.add(text[i:f])
keywords = sorted(category_links)
# filter out irrelevant sections
filtered_text, filtered_ann = filter_sections(
text, annotations, site.boilerplate_texts if site else None
)
# debug statistics
if logger_stats.isEnabledFor(logging.DEBUG):
sb = annotations['semantic_breaks']
fsb = filtered_ann['semantic_breaks']
logger_stats.debug(
f'Page statistics:'
f' html_len={len(html)} text_len={len(filtered_text)}'
f' ratio={len(filtered_text) / len(html):.2f};'
f' sections={len(sb)} filtered_sections={len(fsb)}'
f' ratio={len(fsb) / len(sb):.2f} url={durl.url()}'
)
return TextResource(
content_type='html',
last_change=last_change,
text_len=len(text),
lang=lang,
title=title,
init_fields={
'durl': durl,
'site': site,
'headers': resp['headers'],
'redirects': resp['redirects'],
'links_int': links_int,
'links_ext': links_ext,
'shortlink': shortlink,
'canonical': True if canonical else None,
'head': head,
},
search_fields={
'title': title,
'pub_date': last_change,
'keywords': keywords,
'text': filtered_text,
'annotations': filtered_ann,
'head': str(head),
},
)


def filter_sections(text, annotations, boilerplate_texts):
"""
Filter out irrelevant sections using scores and factoring in neighbors.
"""
tags = annotations['tags']
sb = annotations['semantic_breaks']
section_ids = annotations['section_ids']
# for i1,f1 in sorted(tags.keys()):
# print(' ', i1,f1,tags[(i1,f1)], text[i1:f1])
# for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
# print('-' * lvl, i,f,','.join(tags[(i+1, f)]), sb[i], txt)
# print('_' * 50)
# from pprint import pprint
# pprint(sb)
# pprint(tags)
# pprint(section_ids)
# calculate keep scores for sections
# negative scores mean: drop; positive scores mean keep;
# scores between -2 and 2 are undecided
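    # (in the end only sections with a final score above 2 are kept,
    # see the filtering near the end of this function)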
sections_keep = {}
headline_probs = {}
for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
if prob := headline_probability(txt, tags[(i, f)], lvl):
headline_probs[(i, f)] = prob
        n_chars = f - i - 1
        # string length
        w = (n_chars - 80) / 80  # initial weight
        # punctuation
        w += 0.4 * txt.count('.') + 0.1 * txt.count(',')
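        # (sections shorter than ~80 characters start with a negative
        # weight; each period adds 0.4 and each comma 0.1)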
# p tag
if 'p' in tags[(i + 1, f)]: # prefer keeping paragraphs
w += 0.7
# links
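        # penalize sections dominated by links (navigation menus, tag
        # clouds, link lists): high link density with short link texts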
n_links, link_density, avg_text_len = get_tag_counts(
('a',), i, f, tags, text
)
if link_density > 0.5:
w = -n_links
elif link_density > 0.3 and avg_text_len < 60:
w = -3
else:
n_li, li_density, li_len = get_tag_counts(
('li',), i, f, tags, text
)
if link_density > 0.2 and li_density > 0.8 and li_len < 50:
w = -3
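        # sections at semantic-break levels 52-59 get a score of at least 1.0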
if 52 <= lvl < 60:
w = max(w, 1.0)
if 'sidebar' in ' '.join(section_ids.get(i, [])):
w = -3
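        # penalize short navigation markers such as 'RSS' or 'MENU' links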
if len(txt) < 20 and ('RSS' in txt or 'MENU' in txt):
w = -3
        # special chars: wordpress prev/next navigation links
        # (arrow characters assumed, e.g. '← Older posts' / 'Newer posts →')
        if txt.startswith('←') or txt.endswith('→'):
            w = -3
# remove boilerplate texts
if boilerplate_texts and txt in boilerplate_texts:
w = -10
sections_keep[(i, f)] = w, lvl
# amend keep scores: look at preceding / subsequent sections with
# equal level and transfer their keep scores to the current section
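    # (an undecided section needs at least 3 equal-level neighbors;
    # twice their average score is then added to its own score)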
n = len(sections_keep)
    sections = sorted(sections_keep.keys())
# inspect subsequent sections:
for rev_ind, s_range in enumerate(reversed(sections)):
ind = n - 1 - rev_ind
w, lvl = sections_keep[s_range]
if abs(w) <= 2:
w_sum = 0
n_peers = 0
for i in range(ind + 1, min(n, ind + 15)):
w_, lvl_ = sections_keep[sections[i]]
if lvl_ != lvl:
break
n_peers += 1
w_sum += w_
if n_peers >= 3:
sections_keep[s_range] = w + 2 * w_sum / n_peers, lvl
# inspect preceding sections:
for ind, s_range in enumerate(sections):
w, lvl = sections_keep[s_range]
if abs(w) <= 2:
w_sum = 0
n_peers = 0
for i in range(ind - 1, max(0, ind - 15), -1):
w_, lvl_ = sections_keep[sections[i]]
if lvl_ != lvl:
break
n_peers += 1
w_sum += w_
if n_peers >= 3:
sections_keep[s_range] = w + 2 * w_sum / n_peers, lvl
# amend keep scores: look at sections that could be headlines
# for subsequent kept sections and increase their score;
    # also allow for up to 2 sections in between (which will also
    # have their score increased)
for rev_ind, s_range in enumerate(reversed(sections)):
ind = n - 1 - rev_ind
w, lvl = sections_keep[s_range]
if abs(w) <= 2:
if headline_probs.get(s_range, 0) > 0.49:
# look at subsequent sections with higher level
child_weights = []
for i in range(ind + 1, n):
w_, lvl_ = sections_keep[sections[i]]
if lvl_ <= lvl or w_ < -2:
break
child_weights.append(w_)
if nc := len(child_weights):
child_avg = sum(child_weights) / nc
if w + 1.2 * child_avg > 2:
sections_keep[s_range] = w + 1.2 * child_avg, lvl
if nc > 1:
if (w1 := child_weights[0]) <= 2:
sections_keep[sections[ind + 1]] = (
w1 + 1.5 * child_avg,
lvl,
)
if nc > 2:
if (w2 := child_weights[1]) <= 2:
sections_keep[sections[ind + 2]] = (
w2 + 2 * child_avg,
lvl,
)
# clean annotations
clean_annotations(annotations)
# debug sections
if logger_sections.isEnabledFor(logging.DEBUG):
logger_sections.debug('============= Weighted sections =============')
for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
w, lvl = sections_keep[(i, f)]
indent = ('+' if w > 2 else '-') * lvl
ts = ','.join(tags[(i + 1, f)])
logger_sections.debug(f'{indent} {i} {f} {ts} {txt} {w:.2f}')
    # narrow down annotations and text to the kept sections
    # (sections with score <= 2, i.e. undecided or negative, are dropped)
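    # sections are removed back to front so that the character offsets
    # of the remaining sections stay valid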
filtered_text = text
filtered_ann = deepcopy(annotations)
for i, f in sorted(sections_keep.keys(), reverse=True):
w, lvl = sections_keep[(i, f)]
if w <= 2.0:
filtered_ann = annotations_remove_section(filtered_ann, i, f)
filtered_text = filtered_text[:i] + filtered_text[f:]
clean_annotations(filtered_ann)
# debug filtered sections
if logger_sections.isEnabledFor(logging.DEBUG):
logger_sections.debug('')
logger_sections.debug('============= Filtered sections =============')
fsb = filtered_ann['semantic_breaks']
ftags = filtered_ann['tags']
for i, f, lvl, txt in iter_sections(filtered_text, fsb, max_level=100):
indent = ' ' * lvl
ts = ','.join(ftags.get((i + 1, f), []))
logger_sections.debug(f'{indent} {lvl} {i} {f} {ts} {txt}')
return filtered_text, filtered_ann
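
# Usage sketch (hypothetical caller; `html`, `headers`, `url` and `site`
# would come from the crawler's fetcher):
#
#     resp = {'content': html, 'headers': headers, 'redirects': [url]}
#     resource = await parse_html(await Durl(url), resp, site)
#
# The result is a TextResource, a ResourceRedirect / ResourceError, or None.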