"""
Parsing of a site's startpage.
"""
|
|
|
|
import re
|
|
from datetime import datetime
|
|
from typing import Any, Optional
|
|
|
|
from ..models import Site, TextResource
|
|
from ..resource import feed_types
|
|
from ..utils.durl import Durl, get_ips
|
|
from ..utils.html import clean_html
|
|
from ..utils.lang import clean_lang
|
|
from ..utils.link import (
|
|
extract_domain,
|
|
in_blacklist,
|
|
link_rels,
|
|
meta_names,
|
|
meta_props,
|
|
)
|
|
|
|
# Separators between keywords in a <meta name="keywords"> value.
# Raw string per regex convention; matches comma, semicolon, CR or LF.
re_meta_keyword_sep = re.compile(r'[,;\r\n]')
|
|
|
|
|
|
def cut_str(s: Optional[str], l: int) -> Optional[str]:
    """
    Cut a string *s* to a maximal length *l* from the left.

    Falsy input (None or empty string) yields None.
    """
    if not s:
        return None
    return s[:l]
|
|
|
|
|
|
async def parse_startpage(
    startpage: TextResource, app=None, site: Optional[Site] = None
) -> Site:
    """
    Parse a site's startpage and return a Site instance.

    If a site instance is given, update it.

    :param startpage: fetched startpage resource; its ``init_fields`` must
        contain 'durl', 'head', 'links_int' and 'redirects'
    :param app: optional application object whose 'filter_site' plugin
        decides whether crawling is enabled for the site
    :param site: optional existing Site to update in place
    """
    # Pieces prepared by the fetcher: definitive URL and <head> soup.
    durl = startpage.init_fields['durl']
    soup = startpage.init_fields['head']
    meta = collect_meta_tags(soup)
    meta_links = await collect_meta_links(soup, durl)
    links_ext = await collect_external_links(startpage, meta_links)
    links_int = startpage.init_fields['links_int']
    langs = extract_languages(startpage, meta, meta_links)
    title, description, keywords = extract_meta_texts(startpage, meta)

    # feeds: the <link>-declared feeds, plus WordPress's default feed path
    # when the generator meta tag reveals a WordPress site
    feeds = meta_links['feeds']
    if 'wordpress' in (meta.get('generator') or '').lower():
        url = durl.site() + 'feed/'
        feeds[url] = 'application/rss+xml'
        # TODO later: maybe also probe other possible feed paths 'rss', 'rss/'

    # network params (canonical_url, base_urls, domains)
    ips = await get_ips(durl.hostname)
    redirects = []
    # Reduce each URL in the redirect chain to its site root; drop
    # redirects whose URL cannot be parsed.
    for redirect in startpage.init_fields['redirects']:
        redir_url = await Durl(redirect)
        if redir_url:
            redirects.append(redir_url.site())
    base_urls = redirects + [durl.url()]
    domains = [extract_domain(durl.hostname)]

    if site:  # update an existing Site
        # Keep the previously known canonical URL if none was found now.
        site.canonical_url = meta_links['canonical_url'] or site.canonical_url
        site.base_urls = base_urls
        site.domains = domains
        site.ips = ips
        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
        # consider datetime.now(timezone.utc) if the models accept it.
        site.last_update = datetime.utcnow()
        site.last_pub = startpage.last_change
        # NOTE(review): here langs stays a set, while the create branch
        # below stores list(langs) — confirm the model tolerates both.
        site.langs = langs
        site.alt_langs = meta_links['alt_langs']
        site.title = title
        site.description = description
        site.keywords = keywords
        # Merge new linkbacks into the existing mapping (do not replace).
        site.linkbacks.update(meta_links['linkbacks'])
        site.meta_info = meta
        # Re-run dataclass post-init so derived fields are recomputed from
        # the freshly parsed values.
        site.__post_init__(
            base_durl=durl,
            feeds=feeds,
            links_ext=links_ext,
            links_int=links_int,
            startpage_text=startpage.search_fields['text'],
        )
    else:  # create new Site instance
        site = Site(
            # post_init fields
            base_durl=durl,
            feeds=feeds,
            links_ext=links_ext,
            links_int=links_int,
            startpage_text=startpage.search_fields['text'],
            # dataclass fields
            canonical_url=meta_links['canonical_url'],
            base_urls=base_urls,
            domains=domains,
            ips=ips,
            last_update=datetime.utcnow(),
            last_pub=startpage.last_change,
            langs=list(langs),
            alt_langs=meta_links['alt_langs'],
            title=title,
            description=description,
            keywords=keywords,
            linkbacks=meta_links['linkbacks'],
            meta_info=meta,
        )
    # Fall back to resolving the site's canonical/derived URL when the
    # startpage host did not resolve to any IPs.
    if site.ips is None and site.url:
        site.ips = await get_ips(site.url.hostname)
    # Let the site filter plugin decide whether this site gets crawled.
    if app and site.startpage_text:
        site_filter = app.plugins['filter_site'].site_filter
        site.crawl_enabled = await site_filter(site)
    return site
|
|
|
|
|
|
def collect_meta_tags(soup):
    """
    Collect selected meta tags (meta_names and meta_props) with their values.

    :param soup: parsed HTML head (BeautifulSoup-like object)
    :return: dict mapping meta tag name/property to its content value
    """
    meta = {}
    for tag in soup.find_all('meta'):
        # Whitelisted "name" attributes; NOTE: the content may be None here
        # (only the property branch below requires a non-empty content).
        if (name := tag.get('name')) and name in meta_names:
            meta[name] = tag.get('content')
        # `prop` instead of `property`: avoid shadowing the builtin.
        if (prop := tag.get('property')) in meta_props:
            if content := tag.get('content'):
                meta[prop] = content
        if tag.get('http-equiv') == 'content-language':  # old html
            if content := tag.get('content'):
                meta['http_equiv_lang'] = content
    return meta
|
|
|
|
|
|
async def collect_meta_links(soup, base_durl) -> dict[str, Any]:
    """
    Collect link tags with site scope (feeds, linkbacks, canonical, ...).

    :param soup: parsed HTML head (BeautifulSoup-like object)
    :param base_durl: base URL used to resolve relative hrefs
    :return: dict with keys 'feeds', 'linkbacks', 'alt_langs',
        'canonical_url'
    """
    feeds: dict[str, Any] = {}
    linkbacks: dict[str, str] = {}
    alt_langs: dict[str, str] = {}
    canonical = None
    for tag in soup.find_all('link'):
        rel_values = set(tag.get('rel', []))
        # Only rel values we care about.
        if not rel_values or not rel_values & link_rels:
            continue
        href = tag.get('href')
        if not href:
            continue
        durl = await Durl(href, base=base_durl)
        # Skip unparsable and blacklisted targets.
        if not durl or in_blacklist(durl.hostname):
            continue
        url = durl.url()
        mime = tag.get('type')
        if mime in feed_types:
            feeds[url] = mime
        elif 'canonical' in rel_values:
            canonical = url
        elif 'alternate' in rel_values and (hreflang := tag.get('hreflang')):
            if code := clean_lang(hreflang):
                alt_langs[code] = durl.url()
        elif 'webmention' in rel_values:
            linkbacks[url] = 'webmention'
        elif 'pingback' in rel_values:
            linkbacks[url] = 'pingback'
    # Normalize the canonical URL to its site root; drop it if unparsable.
    canonical_url = None
    if canonical:
        if canonical_durl := await Durl(canonical):
            canonical_url = canonical_durl.site()
    return {
        'feeds': feeds,
        'linkbacks': linkbacks,
        'alt_langs': alt_langs,
        'canonical_url': canonical_url,
    }
|
|
|
|
|
|
async def collect_external_links(startpage, meta_links) -> dict[str, str]:
    """
    Return external links (mapping from URL to link text) from startpage.

    Also add links to alternate language variants of the site.

    :param startpage: fetched startpage resource providing 'links_ext'
        and 'durl' in its ``init_fields``
    :param meta_links: result of :func:`collect_meta_links`
    """
    external_links = startpage.init_fields['links_ext'].copy()
    netloc = startpage.init_fields['durl'].netloc
    for lang, lang_url in meta_links['alt_langs'].items():
        # Only language variants hosted elsewhere are external links.
        if netloc not in lang_url:
            durl = await Durl(lang_url)
            if durl:
                # Bug fix: key by the normalized URL string, not the Durl
                # instance — the declared return type is dict[str, str] and
                # the other collectors key by link_durl.url() as well.
                external_links[durl.url()] = f'Alternate language: {lang}'
    return external_links
|
|
|
|
|
|
def extract_meta_texts(page, meta) -> tuple[str, Optional[str], list[str]]:
    """
    Extract and return title, description, keywords from a page and meta tags.

    Consumes (pops) the 'title', 'description' and 'keywords' meta entries.
    """
    # Prefer og:site_name; fall back to the page's own title.
    title = meta.get('og:site_name') or page.search_fields['title'] or ''
    if meta_title := meta.pop('title', None):
        # Append the meta title only when it adds new information.
        if meta_title.lower() not in title.lower():
            separator = '; ' if title else ''
            title = f'{title}{separator}{meta_title}'
    title = cut_str(clean_html(title), 200)
    description = cut_str(clean_html(meta.pop('description', None)), 2000)
    keywords: list[str] = []
    if meta_keywords := meta.pop('keywords', None):
        parts = re_meta_keyword_sep.split(meta_keywords)
        keywords = [part.strip()[:50] for part in parts if part.strip()]
        if len(keywords) < 2:
            # The usual separators didn't apply; fall back to splitting
            # on single spaces.
            keywords = [
                word.strip()[:50]
                for word in meta_keywords.split(' ')
                if word.strip()
            ]
    return title, description, keywords
|
|
|
|
|
|
def extract_languages(page, meta, meta_links) -> set[str]:
    """
    Extract languages from a page's html tag, meta tags and HTTP headers.

    Also add the language detected in the text content of the page.

    Return a set of ISO 639-1 language codes.

    See also https://www.w3.org/International/questions/qa-http-and-lang and
    https://www.w3.org/International/questions/qa-html-language-declarations
    """
    languages = set()
    # Language of the page itself (html lang attribute / detected lang).
    if lang := clean_lang(page.lang):
        languages.add(lang)
    # Legacy <meta http-equiv="content-language"> declaration.
    if lang := clean_lang(meta.get('http_equiv_lang')):
        languages.add(lang)
    if lang := clean_lang(meta.get('dc.language')):
        languages.add(lang)
    if lang := clean_lang(meta.get('og:locale')):
        languages.add(lang)
    # hreflang variants hosted on this same site declare its languages.
    for lang, lang_url in meta_links['alt_langs'].items():
        if page.init_fields['durl'].netloc in lang_url:
            if lng := clean_lang(lang):
                languages.add(lng)
    # The HTTP Content-Language header may hold a comma-separated list.
    lngs = (
        page.init_fields['headers']
        .get('Content-Language', '')
        .lower()
        .replace(' ', '')
        .split(',')
    )
    for lng in lngs:
        if lang := clean_lang(lng):
            languages.add(lang)
    # Bug fix: page.lang was added unconditionally before, which could put
    # None into the result set when no language was detected. Guard on
    # truthiness; the cleaned form was already added at the top.
    if page.lang:
        languages.add(page.lang)
    return languages
|