atextcrawler/src/atextcrawler/resource/dedup.py

"""
Find boilerplate texts.
"""

from collections import Counter

from ..models import TextResource
from ..utils.probe import extract_samples
from ..utils.section import iter_sections


async def store_boilerplate_texts(fetcher, conn, site):
"""
Find and store boilerplate texts of a site.
Fetch the start page and internal sample links obtained from it.
If there are sufficienty frequently appearing text sections,
consider them as boilerplate texts.
If boilerplate_texts were found, update the given site instance.
"""
    startpage = await fetcher.fetch(site.base_url, site=site)
    if (
        not isinstance(startpage, TextResource)
        or startpage.content_type != 'html'
    ):
        return

    # fetch sample resources
    sample_links = extract_samples(startpage.init_fields['links_int'])
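    # NB: extract_samples (utils.probe) is expected to pick just a few of
    # the start page's internal links, so only a handful of pages is probed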
    resources = [startpage]
    for sample_link in sample_links:
        if sample_link.path == site.base_url:  # avoid duplicate resources
            continue  # NB: duplicate resources may have different paths
        sample_resource = await fetcher.fetch(sample_link.url(), site=None)
        if (
            isinstance(sample_resource, TextResource)
            and sample_resource.content_type == 'html'
        ):
            resources.append(sample_resource)

    # find common texts in resources
    if (n_resources := len(resources)) > 2:
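        # at least 3 resources are required: only then can a text occurring
        # on more than 2 pages be told apart from page-specific content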
        text_freq = Counter()
        for resource in resources:
            text = resource.search_fields['text']
            semantic_breaks = resource.search_fields['annotations'][
                'semantic_breaks'
            ]
            for sec in iter_sections(text, semantic_breaks):
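                # iter_sections yields section tuples; sec[3] is the
                # section's text, which serves as the counting key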
                text_freq[sec[3]] += 1
        boilerplate_texts = []
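        # if the minimum frequency exceeds 1, some resource was probably
        # fetched twice (the same page under different paths), which would
        # inflate all counts; in that case skip boilerplate detection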
        if min(text_freq.values() or [0]) == 1:  # no resource fetched twice
            for text, freq in text_freq.items():
                if freq > 2:
                    boilerplate_texts.append(text)
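        # persist the (possibly empty) list and update the in-memory site
        # instance; the $1/$2 placeholders suggest an asyncpg connection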
        sql = "UPDATE site SET boilerplate_texts=$1 WHERE id=$2"
        await conn.execute(sql, boilerplate_texts, site.id_)
        site.boilerplate_texts = boilerplate_texts
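
# Minimal usage sketch (hypothetical wiring; in atextcrawler the fetcher
# and database connection are created elsewhere, e.g. an asyncpg pool):
#
#     async def refresh_boilerplate(fetcher, pool, site):
#         async with pool.acquire() as conn:
#             await store_boilerplate_texts(fetcher, conn, site)
#         return site.boilerplate_texts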