48 lines
1.2 KiB
Python
48 lines
1.2 KiB
Python
"""
|
|
Relevance estimation of sites.
|
|
|
|
This plugin implements :func:`site_filter`.
|
|
"""
|
|
|
|
import re
|
|
|
|
from atextcrawler.models import Site
|
|
|
|
# Minimum accumulated score a site must reach for site_filter to return True.
MIN_RELEVANCE_SCORE = 5
|
|
|
|
|
|
async def site_filter(site: Site) -> bool:
    """
    Assess relevance of the site (using language-dependent criteria).

    Only sites having German or English among their languages are
    considered at all. For every entry of ``re_criteria`` that applies to
    one of the site's languages (or to all languages, marked by ``'*'``),
    each match in the startpage text adds the criterion's weight to the
    score; a match in the title and a match in the description each add
    four times the weight.

    :param site: the site to assess; its ``langs``, ``startpage_text``,
        ``title`` and ``description`` attributes are read
    :return: True if the site shall be crawled, i.e. if the score reaches
        ``MIN_RELEVANCE_SCORE``, else False
    """
    # Build the language set once; a missing language list counts as empty.
    site_langs = set(site.langs or ())

    # limit to sites in English or German language
    if not site_langs & {'de', 'en'}:
        return False

    # Title and description are optional; treat a missing startpage text
    # the same way instead of letting the regex raise a TypeError.
    startpage_text = site.startpage_text or ''

    score = 0.0
    for _crit_name, weight, langs, crit_re in re_criteria:
        if '*' not in langs and not site_langs & set(langs):
            continue  # criterion does not apply to any language of the site
        # every occurrence in the startpage text counts once
        score += weight * len(crit_re.findall(startpage_text))
        # a hit in title or description is a stronger signal
        if site.title and crit_re.search(site.title):
            score += 4 * weight
        if site.description and crit_re.search(site.description):
            score += 4 * weight

    # TODO: add criteria for named entities (FdA-IFA, FAU, ...)

    return score >= MIN_RELEVANCE_SCORE
|
|
|
|
|
|
# Relevance criteria evaluated by site_filter.
#
# Each entry is a tuple (name, weight, langs, regex):
#   name   -- label of the criterion
#   weight -- score added for every match of the regex in the startpage text
#             (matches in title or description count four times the weight)
#   langs  -- tuple of language codes the criterion applies to;
#             '*' means the criterion applies regardless of language
#   regex  -- compiled, case-insensitive pattern
re_criteria = {
    (
        'anarch',
        1.0,
        ('*',),
        # the lookbehind rejects matches preceded by "p" (e.g. "panarchy")
        re.compile(r'((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
    ),
    (
        'libertär',
        0.5,
        # Must be a 1-tuple: a bare ('de') is just the string 'de', which
        # set() splits into {'d', 'e'}, so the criterion would never apply.
        ('de',),
        # Match "ä" both precomposed (U+00E4) and decomposed
        # ("a" followed by combining diaeresis U+0308).
        re.compile(r'(libert(\u00e4r|a\u0308r))', re.I),
    ),
}
|