60 lines
2.1 KiB
Python
60 lines
2.1 KiB
Python
"""
|
|
Find boilerplate texts.
|
|
"""
|
|
|
|
from collections import Counter
|
|
|
|
from ..models import TextResource
|
|
from ..utils.probe import extract_samples
|
|
from ..utils.section import iter_sections
|
|
|
|
|
|
async def store_boilerplate_texts(fetcher, conn, site):
    """
    Detect the boilerplate texts of a site and persist them.

    The start page (`site.base_url`) is fetched first. If the result is
    not a `TextResource` with content type 'html', nothing is done.
    Otherwise sample links are extracted from the start page's internal
    links and each of them is fetched as well; only HTML text resources
    are kept.

    If more than two resources were collected, their texts are split
    into sections and the occurrences of item 3 of every section
    (presumably the section text) are counted over all resources.
    Values counted more than twice are regarded as boilerplate texts,
    provided that at least one value was counted exactly once.

    The resulting list (which may be empty) is written to the `site`
    table and assigned to `site.boilerplate_texts`. With two or fewer
    resources neither the database nor `site` is touched.
    """

    def is_html(candidate):
        # The fetcher may return something other than a TextResource.
        return (
            isinstance(candidate, TextResource)
            and candidate.content_type == 'html'
        )

    startpage = await fetcher.fetch(site.base_url, site=site)
    if not is_html(startpage):
        return

    # Collect the start page plus all sample pages that are HTML.
    pages = [startpage]
    for link in extract_samples(startpage.init_fields['links_int']):
        # Skip the start page itself; duplicates reachable under other
        # paths are not caught here.
        # NOTE(review): this compares a link *path* with the site's
        # *base URL* -- confirm that both really share one format.
        if link.path == site.base_url:
            continue
        fetched = await fetcher.fetch(link.url(), site=None)
        if is_html(fetched):
            pages.append(fetched)

    # Too few pages to tell frequent sections from ordinary ones.
    if len(pages) <= 2:
        return

    # Count how often each section value occurs over all pages.
    occurrences = Counter()
    for page in pages:
        page_text = page.search_fields['text']
        breaks = page.search_fields['annotations']['semantic_breaks']
        occurrences.update(
            section[3] for section in iter_sections(page_text, breaks)
        )

    # Trust the counts only if some value occurs exactly once; the
    # original code treats this as "no resource fetched twice".
    if min(occurrences.values() or [0]) == 1:
        boilerplate_texts = [
            value for value, count in occurrences.items() if count > 2
        ]
    else:
        boilerplate_texts = []

    await conn.execute(
        "UPDATE site SET boilerplate_texts=$1 WHERE id=$2",
        boilerplate_texts,
        site.id_,
    )
    site.boilerplate_texts = boilerplate_texts
|