# atextcrawler/src/atextcrawler/resource/page.py
"""
Parse HTML pages.
"""
import logging
from copy import deepcopy
from typing import Optional, Union

from bs4 import BeautifulSoup
from tidylib import tidy_document

from ..models import ResourceError, ResourceRedirect, Site, TextResource
from ..utils.annotation import (
annotate,
annotations_remove_section,
clean_annotations,
get_tag_counts,
headline_probability,
)
from ..utils.date_finder import extract_latest_date
from ..utils.durl import Durl, assort_links
from ..utils.html import (
clean_body,
clean_page,
extract_title,
get_html_lang,
get_html_redirect,
)
from ..utils.http import get_header_links
from ..utils.lang import extract_content_language
from ..utils.section import iter_sections
from ..utils.tag import keep_tags

logger = logging.getLogger(__name__)
logger_debug = logging.getLogger(__name__ + '.debug')
logger_debug.setLevel(logging.INFO)
logger_links = logging.getLogger(__name__ + '.debug.links')
logger_stats = logging.getLogger(__name__ + '.debug.stats')
logger_sections = logging.getLogger(__name__ + '.debug.sections')


async def parse_html(
durl: Durl,
resp: dict,
site: Optional[Site],
) -> Optional[Union[TextResource, ResourceError, ResourceRedirect]]:
"""
Extract relevant data from a response returning a TextResource instance.
The given URL must be the full URL (incl. scheme and netloc) of the page.
"""
html = resp['content']
# follow link to canonical URL
header_links = await get_header_links(resp['headers'], durl, site)
if canonical := header_links.get('canonical'):
if canonical != durl.url():
return ResourceRedirect(resp['redirects'] + [canonical])
# follow html redirect, if present
if redir_url := get_html_redirect(html):
if redir_url not in resp['redirects']:
return ResourceRedirect(resp['redirects'] + [redir_url])
else:
msg = f'Cyclic HTML redirect: {redir_url} in {resp["redirects"]}'
return ResourceError(msg)
    # require an html doctype or an <html> tag
if not html[:14].lower().startswith('<!doctype html'):
if '<html' not in html:
return None
# real URL after redirection
url = resp['redirects'][-1]
durl = await Durl(url)
if not durl:
return None
# page title
title = extract_title(html)
# tidy html
try:
html, _ = tidy_document(
html.encode('utf-8'),
options={
'logical-emphasis': 1,
'merge-divs': 1,
'merge-spans': 1,
'hide-comments': 1,
'output-bom': 0,
'show-errors': 0,
},
)
html = html.decode('utf-8')
    except Exception:
msg = f'Cannot tidy html from {url}'
return ResourceError(msg)
# drop irrelevant tags, including their contents
soup = clean_page(html)
# extract shortlink (from http headers or html head)
shortlink = header_links.get('shortlink')
if not shortlink and soup.head:
for link in soup.head.find_all('link'):
if 'shortlink' in link.get('rel', ''):
if link.get('href'):
shortlink = link.get('href')
break
# language, plaintext, annotations, last change
lang = get_html_lang(html)
html = clean_body(str(soup.body))
head = soup.head
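    # annotate() yields the plain text and stand-off annotations; the keys
    # used below are 'links', 'tags', 'semantic_breaks' and 'section_ids'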
text, annotations = annotate(html)
if lng := extract_content_language(text):
lang = lng
last_change = extract_latest_date(html, lang=lang)
# assort internal and external links
base_url = None
if head and head.base:
base_url = head.base.get('href')
if not base_url and site:
base_url = site.base_url
cleaned_links, links_int, links_ext = await assort_links(
annotations['links'], durl, text, base_url
)
annotations['links'] = cleaned_links
if logger_links.isEnabledFor(logging.DEBUG):
logger_links.debug('==== internal links')
for durl_, txt in links_int.items():
logger_links.debug(f'{durl_.url()} {txt}')
logger_links.debug('==== external links')
for durl_, txt in links_ext.items():
logger_links.debug(f'{durl_.url()} {txt}')
# keywords from category links
category_links = set()
for href, (i, f, rel) in annotations['links'].items():
if rel and ('category' in rel or 'tag' in rel):
category_links.add(text[i:f])
keywords = sorted(category_links)
# filter out irrelevant sections
filtered_text, filtered_ann = filter_sections(
text, annotations, site.boilerplate_texts if site else None
)
# debug statistics
if logger_stats.isEnabledFor(logging.DEBUG):
sb = annotations['semantic_breaks']
fsb = filtered_ann['semantic_breaks']
logger_stats.debug(
f'Page statistics:'
f' html_len={len(html)} text_len={len(filtered_text)}'
f' ratio={len(filtered_text) / len(html):.2f};'
f' sections={len(sb)} filtered_sections={len(fsb)}'
f' ratio={len(fsb) / len(sb):.2f} url={durl.url()}'
)
return TextResource(
content_type='html',
last_change=last_change,
text_len=len(text),
lang=lang,
title=title,
init_fields={
'durl': durl,
'site': site,
'headers': resp['headers'],
'redirects': resp['redirects'],
'links_int': links_int,
'links_ext': links_ext,
'shortlink': shortlink,
'canonical': True if canonical else None,
'head': head,
},
search_fields={
'title': title,
'pub_date': last_change,
'keywords': keywords,
'text': filtered_text,
'annotations': filtered_ann,
'head': str(head),
},
)


def filter_sections(text, annotations, boilerplate_texts):
"""
Filter out irrelevant sections using scores and factoring in neighbors.
"""
tags = annotations['tags']
sb = annotations['semantic_breaks']
section_ids = annotations['section_ids']
# for i1,f1 in sorted(tags.keys()):
# print(' ', i1,f1,tags[(i1,f1)], text[i1:f1])
# for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
# print('-' * lvl, i,f,','.join(tags[(i+1, f)]), sb[i], txt)
# print('_' * 50)
# from pprint import pprint
# pprint(sb)
# pprint(tags)
# pprint(section_ids)
# calculate keep scores for sections
# negative scores mean: drop; positive scores mean keep;
# scores between -2 and 2 are undecided
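    # (in the end only sections with a final score above 2 are kept,
    # see the filtering near the end of this function)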
sections_keep = {}
headline_probs = {}
for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
if prob := headline_probability(txt, tags[(i, f)], lvl):
headline_probs[(i, f)] = prob
        n_chars = f - i - 1
        # string length
        w = (n_chars - 80) / 80  # initial weight
        # punctuation
        w += 0.4 * txt.count('.') + 0.1 * txt.count(',')
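        # (sections shorter than ~80 characters start with a negative
        # weight; each period adds 0.4 and each comma 0.1)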
# p tag
if 'p' in tags[(i + 1, f)]: # prefer keeping paragraphs
w += 0.7
# links
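        # penalize sections dominated by links (navigation menus, tag
        # clouds, link lists): high link density with short link texts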
n_links, link_density, avg_text_len = get_tag_counts(
('a',), i, f, tags, text
)
if link_density > 0.5:
w = -n_links
elif link_density > 0.3 and avg_text_len < 60:
w = -3
else:
n_li, li_density, li_len = get_tag_counts(
('li',), i, f, tags, text
)
if link_density > 0.2 and li_density > 0.8 and li_len < 50:
w = -3
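        # sections at semantic-break levels 52-59 get a score of at least 1.0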
if 52 <= lvl < 60:
w = max(w, 1.0)
if 'sidebar' in ' '.join(section_ids.get(i, [])):
w = -3
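        # penalize short navigation markers such as 'RSS' or 'MENU' links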
if len(txt) < 20 and ('RSS' in txt or 'MENU' in txt):
w = -3
        # special chars: wordpress prev/next navigation links
        # (arrow characters assumed, e.g. '← Older posts' / 'Newer posts →')
        if txt.startswith('←') or txt.endswith('→'):
            w = -3
# remove boilerplate texts
if boilerplate_texts and txt in boilerplate_texts:
w = -10
sections_keep[(i, f)] = w, lvl
# amend keep scores: look at preceding / subsequent sections with
# equal level and transfer their keep scores to the current section
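    # (an undecided section needs at least 3 equal-level neighbors;
    # twice their average score is then added to its own score)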
n = len(sections_keep)
    sections = sorted(sections_keep.keys())
# inspect subsequent sections:
for rev_ind, s_range in enumerate(reversed(sections)):
ind = n - 1 - rev_ind
w, lvl = sections_keep[s_range]
if abs(w) <= 2:
w_sum = 0
n_peers = 0
for i in range(ind + 1, min(n, ind + 15)):
w_, lvl_ = sections_keep[sections[i]]
if lvl_ != lvl:
break
n_peers += 1
w_sum += w_
if n_peers >= 3:
sections_keep[s_range] = w + 2 * w_sum / n_peers, lvl
# inspect preceding sections:
for ind, s_range in enumerate(sections):
w, lvl = sections_keep[s_range]
if abs(w) <= 2:
w_sum = 0
n_peers = 0
for i in range(ind - 1, max(0, ind - 15), -1):
w_, lvl_ = sections_keep[sections[i]]
if lvl_ != lvl:
break
n_peers += 1
w_sum += w_
if n_peers >= 3:
sections_keep[s_range] = w + 2 * w_sum / n_peers, lvl
# amend keep scores: look at sections that could be headlines
# for subsequent kept sections and increase their score;
    # also allow for up to 2 sections in between (which will also
    # have their score increased)
for rev_ind, s_range in enumerate(reversed(sections)):
ind = n - 1 - rev_ind
w, lvl = sections_keep[s_range]
if abs(w) <= 2:
if headline_probs.get(s_range, 0) > 0.49:
# look at subsequent sections with higher level
child_weights = []
for i in range(ind + 1, n):
w_, lvl_ = sections_keep[sections[i]]
if lvl_ <= lvl or w_ < -2:
break
child_weights.append(w_)
if nc := len(child_weights):
child_avg = sum(child_weights) / nc
if w + 1.2 * child_avg > 2:
sections_keep[s_range] = w + 1.2 * child_avg, lvl
if nc > 1:
if (w1 := child_weights[0]) <= 2:
sections_keep[sections[ind + 1]] = (
w1 + 1.5 * child_avg,
lvl,
)
if nc > 2:
if (w2 := child_weights[1]) <= 2:
sections_keep[sections[ind + 2]] = (
w2 + 2 * child_avg,
lvl,
)
# clean annotations
clean_annotations(annotations)
# debug sections
if logger_sections.isEnabledFor(logging.DEBUG):
logger_sections.debug('============= Weighted sections =============')
for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
w, lvl = sections_keep[(i, f)]
indent = ('+' if w > 2 else '-') * lvl
ts = ','.join(tags[(i + 1, f)])
logger_sections.debug(f'{indent} {i} {f} {ts} {txt} {w:.2f}')
    # narrow down annotations and text to the kept sections
    # (sections with score <= 2, i.e. undecided or negative, are dropped)
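    # sections are removed back to front so that the character offsets
    # of the remaining sections stay valid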
filtered_text = text
filtered_ann = deepcopy(annotations)
for i, f in sorted(sections_keep.keys(), reverse=True):
w, lvl = sections_keep[(i, f)]
if w <= 2.0:
filtered_ann = annotations_remove_section(filtered_ann, i, f)
filtered_text = filtered_text[:i] + filtered_text[f:]
clean_annotations(filtered_ann)
# debug filtered sections
if logger_sections.isEnabledFor(logging.DEBUG):
logger_sections.debug('')
logger_sections.debug('============= Filtered sections =============')
fsb = filtered_ann['semantic_breaks']
ftags = filtered_ann['tags']
for i, f, lvl, txt in iter_sections(filtered_text, fsb, max_level=100):
indent = ' ' * lvl
ts = ','.join(ftags.get((i + 1, f), []))
logger_sections.debug(f'{indent} {lvl} {i} {f} {ts} {txt}')
return filtered_text, filtered_ann
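
# Usage sketch (hypothetical caller; `html`, `headers`, `url` and `site`
# would come from the crawler's fetcher):
#
#     resp = {'content': html, 'headers': headers, 'redirects': [url]}
#     resource = await parse_html(await Durl(url), resp, site)
#
# The result is a TextResource, a ResourceRedirect / ResourceError, or None.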