""" Parsing of a site's startpage. """ import re from datetime import datetime from typing import Any, Optional from ..models import Site, TextResource from ..resource import feed_types from ..utils.durl import Durl, get_ips from ..utils.html import clean_html from ..utils.lang import clean_lang from ..utils.link import ( extract_domain, in_blacklist, link_rels, meta_names, meta_props, ) re_meta_keyword_sep = re.compile('[,;\r\n]') def cut_str(s: Optional[str], l: int) -> Optional[str]: """ Cut a string *s* to a maximal length *l* from the left. """ return s[:l] if s else None async def parse_startpage( startpage: TextResource, app=None, site=None ) -> Site: """ Parse a site's startpage and return a Site instance. If a site instance is given, update it. """ durl = startpage.init_fields['durl'] soup = startpage.init_fields['head'] meta = collect_meta_tags(soup) meta_links = await collect_meta_links(soup, durl) links_ext = await collect_external_links(startpage, meta_links) links_int = startpage.init_fields['links_int'] langs = extract_languages(startpage, meta, meta_links) title, description, keywords = extract_meta_texts(startpage, meta) # feeds feeds = meta_links['feeds'] if 'wordpress' in (meta.get('generator') or '').lower(): url = durl.site() + 'feed/' feeds[url] = 'application/rss+xml' # TODO later: maybe also probe other possible feed paths 'rss', 'rss/' # network params (canonical_url, base_urls, domains) ips = await get_ips(durl.hostname) redirects = [] for redirect in startpage.init_fields['redirects']: redir_url = await Durl(redirect) if redir_url: redirects.append(redir_url.site()) base_urls = redirects + [durl.url()] domains = [extract_domain(durl.hostname)] if site: # update an existing Site site.canonical_url = meta_links['canonical_url'] or site.canonical_url site.base_urls = base_urls site.domains = domains site.ips = ips site.last_update = datetime.utcnow() site.last_pub = startpage.last_change site.langs = langs site.alt_langs = meta_links['alt_langs'] site.title = title site.description = description site.keywords = keywords site.linkbacks.update(meta_links['linkbacks']) site.meta_info = meta site.__post_init__( base_durl=durl, feeds=feeds, links_ext=links_ext, links_int=links_int, startpage_text=startpage.search_fields['text'], ) else: # create new Site instance site = Site( # post_init fields base_durl=durl, feeds=feeds, links_ext=links_ext, links_int=links_int, startpage_text=startpage.search_fields['text'], # dataclass fields canonical_url=meta_links['canonical_url'], base_urls=base_urls, domains=domains, ips=ips, last_update=datetime.utcnow(), last_pub=startpage.last_change, langs=list(langs), alt_langs=meta_links['alt_langs'], title=title, description=description, keywords=keywords, linkbacks=meta_links['linkbacks'], meta_info=meta, ) if site.ips is None and site.url: site.ips = await get_ips(site.url.hostname) if app and site.startpage_text: site_filter = app.plugins['filter_site'].site_filter site.crawl_enabled = await site_filter(site) return site def collect_meta_tags(soup): """ Collect selected meta tags (meta_names and meta_props) with their values. 
""" meta = {} for tag in soup.find_all('meta'): if (name := tag.get('name')) and name in meta_names: meta[name] = tag.get('content') if (property := tag.get('property')) in meta_props: if content := tag.get('content'): meta[property] = content if tag.get('http-equiv') == 'content-language': # old html if content := tag.get('content'): meta['http_equiv_lang'] = content return meta async def collect_meta_links(soup, base_durl) -> dict[str, Any]: """ Collect link tags with site scope (feeds, linkbacks, canonical, ...). """ linkbacks = {} feeds = {} alt_langs = {} canonical_url = None for tag in soup.find_all('link'): if not (rels := set(tag.get('rel', []))) or not rels & link_rels: continue if not (url := tag.get('href')): continue if not (link_durl := await Durl(url, base=base_durl)): continue if in_blacklist(link_durl.hostname): continue link_url = link_durl.url() link_type = tag.get('type') if link_type in feed_types: feeds[link_url] = link_type elif 'canonical' in rels: canonical_url = link_url elif 'alternate' in rels and (hreflang := tag.get('hreflang')): if lang := clean_lang(hreflang): alt_langs[lang] = link_durl.url() elif 'webmention' in rels: linkbacks[link_url] = 'webmention' elif 'pingback' in rels: linkbacks[link_url] = 'pingback' if canonical_url: if canonical_durl := await Durl(canonical_url): canonical_url = canonical_durl.site() else: canonical_url = None return { 'feeds': feeds, 'linkbacks': linkbacks, 'alt_langs': alt_langs, 'canonical_url': canonical_url, } async def collect_external_links(startpage, meta_links) -> dict[str, str]: """ Return external links (mapping from URL to link text) from startpage. Also add links to alternate language variants of the site. """ external_links = startpage.init_fields['links_ext'].copy() netloc = startpage.init_fields['durl'].netloc for lang, lang_url in meta_links['alt_langs'].items(): if netloc not in lang_url: durl = await Durl(lang_url) if durl: external_links[durl] = f'Alternate language: {lang}' return external_links def extract_meta_texts(page, meta) -> tuple[str, Optional[str], list[str]]: """ Extract and return title, description, keywords from a page and meta tags. """ title = meta.get('og:site_name') if not title: title = page.search_fields['title'] or '' if meta_title := meta.pop('title', None): if meta_title.lower() not in title.lower(): title += ('; ' if title else '') + meta_title title = cut_str(clean_html(title), 200) description = cut_str(clean_html(meta.pop('description', None)), 2000) if meta_keywords := meta.pop('keywords', None): kws = re_meta_keyword_sep.split(meta_keywords) keywords = [kw.strip()[:50] for kw in kws if kw.strip()] if len(keywords) < 2: keywords = [ kw.strip()[:50] for kw in meta_keywords.split(' ') if kw.strip() ] else: keywords = [] return title, description, keywords def extract_languages(page, meta, meta_links) -> set[str]: """ Extract languages from a page's html tag, meta tags and HTTP headers. Also add the language detected in the text content of the page. Return a set of ISO 639-1 language codes. 


def extract_languages(page, meta, meta_links) -> set[str]:
    """
    Extract languages from a page's html tag, meta tags and HTTP headers.

    Also add the language detected in the text content of the page.

    Return a set of ISO 639-1 language codes.

    See also https://www.w3.org/International/questions/qa-http-and-lang
    and https://www.w3.org/International/questions/qa-html-language-declarations
    """
    languages = set()
    if lang := clean_lang(page.lang):
        languages.add(lang)
    if lang := clean_lang(meta.get('http_equiv_lang')):
        languages.add(lang)
    if lang := clean_lang(meta.get('dc.language')):
        languages.add(lang)
    if lang := clean_lang(meta.get('og:locale')):
        languages.add(lang)
    for lang, lang_url in meta_links['alt_langs'].items():
        if page.init_fields['durl'].netloc in lang_url:
            if lng := clean_lang(lang):
                languages.add(lng)
    # languages from the Content-Language HTTP header (comma-separated)
    lngs = (
        page.init_fields['headers']
        .get('Content-Language', '')
        .lower()
        .replace(' ', '')
        .split(',')
    )
    for lng in lngs:
        if lang := clean_lang(lng):
            languages.add(lang)
    return languages
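
# Example (sketch): for a page with `<html lang="de">` and the header
# `Content-Language: de, en-US`, the result would be {'de', 'en'} --
# assuming clean_lang() normalizes region-qualified tags such as 'en-US'
# to their two-letter ISO 639-1 code.
#
#     >>> sorted(extract_languages(page, meta, meta_links))
#     ['de', 'en']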