atextcrawler/src/atextcrawler/resource/dedup.py

"""
Find boilerplate texts.
"""

from collections import Counter

from ..models import TextResource
from ..utils.probe import extract_samples
from ..utils.section import iter_sections


async def store_boilerplate_texts(fetcher, conn, site):
"""
Find and store boilerplate texts of a site.
Fetch the start page and internal sample links obtained from it.
If there are sufficienty frequently appearing text sections,
consider them as boilerplate texts.
If boilerplate_texts were found, update the given site instance.
"""
    startpage = await fetcher.fetch(site.base_url, site=site)
    if (
        not isinstance(startpage, TextResource)
        or startpage.content_type != 'html'
    ):
        return

    # fetch sample resources
    sample_links = extract_samples(startpage.init_fields['links_int'])
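    # NB: extract_samples (utils.probe) is expected to pick just a few of
    # the start page's internal links, so only a handful of pages is probed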
    resources = [startpage]
    for sample_link in sample_links:
        if sample_link.path == site.base_url:  # avoid duplicate resources
            continue  # NB: duplicate resources may have different paths
        sample_resource = await fetcher.fetch(sample_link.url(), site=None)
        if (
            isinstance(sample_resource, TextResource)
            and sample_resource.content_type == 'html'
        ):
            resources.append(sample_resource)

    # find common texts in resources
    if (n_resources := len(resources)) > 2:
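        # at least 3 resources are required: only then can a text occurring
        # on more than 2 pages be told apart from page-specific content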
        text_freq = Counter()
        for resource in resources:
            text = resource.search_fields['text']
            semantic_breaks = resource.search_fields['annotations'][
                'semantic_breaks'
            ]
            for sec in iter_sections(text, semantic_breaks):
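                # iter_sections yields section tuples; sec[3] is the
                # section's text, which serves as the counting key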
                text_freq[sec[3]] += 1
        boilerplate_texts = []
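        # if the minimum frequency exceeds 1, some resource was probably
        # fetched twice (the same page under different paths), which would
        # inflate all counts; in that case skip boilerplate detection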
        if min(text_freq.values() or [0]) == 1:  # no resource fetched twice
            for text, freq in text_freq.items():
                if freq > 2:
                    boilerplate_texts.append(text)
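        # persist the (possibly empty) list and update the in-memory site
        # instance; the $1/$2 placeholders suggest an asyncpg connection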
        sql = "UPDATE site SET boilerplate_texts=$1 WHERE id=$2"
        await conn.execute(sql, boilerplate_texts, site.id_)
        site.boilerplate_texts = boilerplate_texts
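
# Minimal usage sketch (hypothetical wiring; in atextcrawler the fetcher
# and database connection are created elsewhere, e.g. an asyncpg pool):
#
#     async def refresh_boilerplate(fetcher, pool, site):
#         async with pool.acquire() as conn:
#             await store_boilerplate_texts(fetcher, conn, site)
#         return site.boilerplate_texts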