"""
Relevance estimation of sites.

This plugin implements :func:`site_filter`.
"""

import re

from atextcrawler.models import Site

# Minimum accumulated score a site needs in order to be crawled.
MIN_RELEVANCE_SCORE = 5


async def site_filter(site: Site) -> bool:
    """
    Assess relevance of the site (using language-dependent criteria).

    If the site shall be crawled, return True, else False.

    A site whose languages include neither German nor English is
    rejected right away. Otherwise every entry of the module-level
    ``re_criteria`` table, a ``(name, weight, langs, compiled_regex)``
    tuple, that applies to one of the site's languages (or to ``'*'``)
    contributes to a score:

    * ``weight`` for each match of the regex in the start page text,
    * ``4 * weight`` if the regex matches the title,
    * ``4 * weight`` if the regex matches the description.

    The site is relevant if the score reaches ``MIN_RELEVANCE_SCORE``.

    :param site: the site to assess; ``langs``, ``startpage_text``,
        ``title`` and ``description`` are read
    :return: whether the site shall be crawled
    """
    # Build the language set once instead of once per criterion.
    site_langs = set(site.langs)

    # limit to sites in English or German language
    if not site_langs & {'de', 'en'}:
        return False

    # A missing start page text is scored like an empty one, so that the
    # site can still qualify through its title or description (both of
    # which are guarded against being empty below).
    startpage_text = site.startpage_text or ''

    score = 0.0
    for _crit_name, weight, langs, crit_re in re_criteria:
        # Skip criteria that do not apply to any of the site's languages.
        if '*' not in langs and not site_langs & set(langs):
            continue
        findings = crit_re.findall(startpage_text)
        if findings:
            score += weight * len(findings)
        # Hits in title and description count four times as much as a
        # single hit in the start page text.
        if site.title and crit_re.search(site.title):
            score += 4 * weight
        if site.description and crit_re.search(site.description):
            score += 4 * weight
    # TODO: add criteria for named entities (FdA-IFA, FAU, ...)
    return score >= MIN_RELEVANCE_SCORE