"""
Parsing of a site's startpage.
"""
import re
from datetime import datetime
from typing import Any, Optional
from ..models import Site, TextResource
from ..resource import feed_types
from ..utils.durl import Durl, get_ips
from ..utils.html import clean_html
from ..utils.lang import clean_lang
from ..utils.link import (
    extract_domain,
    in_blacklist,
    link_rels,
    meta_names,
    meta_props,
)

re_meta_keyword_sep = re.compile('[,;\r\n]')


def cut_str(s: Optional[str], l: int) -> Optional[str]:
    """
    Cut a string *s* to a maximal length *l* from the left.
    """
    return s[:l] if s else None
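
# Illustrative behavior (hypothetical values):
#     cut_str('abcdef', 3)  ->  'abc'
#     cut_str(None, 3)      ->  None
#     cut_str('', 3)        ->  None  (falsy strings are mapped to None)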


async def parse_startpage(
    startpage: TextResource, app=None, site=None
) -> Site:
    """
    Parse a site's startpage and return a Site instance.
    If a site instance is given, update it.
    """
    durl = startpage.init_fields['durl']
    soup = startpage.init_fields['head']
    meta = collect_meta_tags(soup)
    meta_links = await collect_meta_links(soup, durl)
    links_ext = await collect_external_links(startpage, meta_links)
    links_int = startpage.init_fields['links_int']
    langs = extract_languages(startpage, meta, meta_links)
    title, description, keywords = extract_meta_texts(startpage, meta)

    # feeds
    feeds = meta_links['feeds']
    if 'wordpress' in (meta.get('generator') or '').lower():
        url = durl.site() + 'feed/'
        feeds[url] = 'application/rss+xml'
    # TODO later: maybe also probe other possible feed paths 'rss', 'rss/'
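    # Illustrative example (hypothetical site): for a WordPress site with
    # durl.site() == 'https://blog.example.org/' this registers
    # 'https://blog.example.org/feed/' as an 'application/rss+xml' feed,
    # the default feed path of a stock WordPress install.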

    # network params (canonical_url, base_urls, domains)
    ips = await get_ips(durl.hostname)
    redirects = []
    for redirect in startpage.init_fields['redirects']:
        redir_url = await Durl(redirect)
        if redir_url:
            redirects.append(redir_url.site())
    base_urls = redirects + [durl.url()]
    domains = [extract_domain(durl.hostname)]

    if site:  # update an existing Site
        site.canonical_url = meta_links['canonical_url'] or site.canonical_url
        site.base_urls = base_urls
        site.domains = domains
        site.ips = ips
        site.last_update = datetime.utcnow()
        site.last_pub = startpage.last_change
        site.langs = list(langs)  # store as list, like the create branch below
        site.alt_langs = meta_links['alt_langs']
        site.title = title
        site.description = description
        site.keywords = keywords
        site.linkbacks.update(meta_links['linkbacks'])
        site.meta_info = meta
        site.__post_init__(
            base_durl=durl,
            feeds=feeds,
            links_ext=links_ext,
            links_int=links_int,
            startpage_text=startpage.search_fields['text'],
        )
    else:  # create a new Site instance
        site = Site(
            # post_init fields
            base_durl=durl,
            feeds=feeds,
            links_ext=links_ext,
            links_int=links_int,
            startpage_text=startpage.search_fields['text'],
            # dataclass fields
            canonical_url=meta_links['canonical_url'],
            base_urls=base_urls,
            domains=domains,
            ips=ips,
            last_update=datetime.utcnow(),
            last_pub=startpage.last_change,
            langs=list(langs),
            alt_langs=meta_links['alt_langs'],
            title=title,
            description=description,
            keywords=keywords,
            linkbacks=meta_links['linkbacks'],
            meta_info=meta,
        )
    if site.ips is None and site.url:
        site.ips = await get_ips(site.url.hostname)
    if app and site.startpage_text:
        site_filter = app.plugins['filter_site'].site_filter
        site.crawl_enabled = await site_filter(site)
    return site
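
# Minimal usage sketch (hypothetical caller; fetch_startpage stands in for
# however the application obtains a parsed TextResource of the startpage):
#
#     startpage = await fetch_startpage('https://example.org/')
#     site = await parse_startpage(startpage)                      # create
#     site = await parse_startpage(startpage, app=app, site=site)  # update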


def collect_meta_tags(soup):
    """
    Collect selected meta tags (meta_names and meta_props) with their values.
    """
    meta = {}
    for tag in soup.find_all('meta'):
        if (name := tag.get('name')) and name in meta_names:
            meta[name] = tag.get('content')
        if (prop := tag.get('property')) in meta_props:
            if content := tag.get('content'):
                meta[prop] = content
        if tag.get('http-equiv') == 'content-language':  # old html
            if content := tag.get('content'):
                meta['http_equiv_lang'] = content
    return meta
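
# Example (illustrative markup; assumes 'description' is in meta_names and
# 'og:locale' is in meta_props): a head containing
#
#     <meta name="description" content="A small blog.">
#     <meta property="og:locale" content="en_US">
#     <meta http-equiv="content-language" content="en">
#
# would be collected as
#
#     {'description': 'A small blog.', 'og:locale': 'en_US',
#      'http_equiv_lang': 'en'}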


async def collect_meta_links(soup, base_durl) -> dict[str, Any]:
    """
    Collect link tags with site scope (feeds, linkbacks, canonical, ...).
    """
    linkbacks = {}
    feeds = {}
    alt_langs = {}
    canonical_url = None
    for tag in soup.find_all('link'):
        if not (rels := set(tag.get('rel', []))) or not rels & link_rels:
            continue
        if not (url := tag.get('href')):
            continue
        if not (link_durl := await Durl(url, base=base_durl)):
            continue
        if in_blacklist(link_durl.hostname):
            continue
        link_url = link_durl.url()
        link_type = tag.get('type')
        if link_type in feed_types:
            feeds[link_url] = link_type
        elif 'canonical' in rels:
            canonical_url = link_url
        elif 'alternate' in rels and (hreflang := tag.get('hreflang')):
            if lang := clean_lang(hreflang):
                alt_langs[lang] = link_durl.url()
        elif 'webmention' in rels:
            linkbacks[link_url] = 'webmention'
        elif 'pingback' in rels:
            linkbacks[link_url] = 'pingback'
    if canonical_url:
        if canonical_durl := await Durl(canonical_url):
            canonical_url = canonical_durl.site()
        else:
            canonical_url = None
    return {
        'feeds': feeds,
        'linkbacks': linkbacks,
        'alt_langs': alt_langs,
        'canonical_url': canonical_url,
    }
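
# Example (illustrative markup): link tags such as
#
#     <link rel="alternate" type="application/rss+xml" href="/feed/">
#     <link rel="canonical" href="https://example.org/home">
#     <link rel="alternate" hreflang="de" href="https://example.org/de/">
#     <link rel="webmention" href="https://example.org/webmention">
#
# would yield roughly (assuming 'application/rss+xml' is in feed_types;
# note that the canonical URL is reduced to the site via .site()):
#
#     {'feeds': {'https://example.org/feed/': 'application/rss+xml'},
#      'linkbacks': {'https://example.org/webmention': 'webmention'},
#      'alt_langs': {'de': 'https://example.org/de/'},
#      'canonical_url': 'https://example.org/'}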


async def collect_external_links(startpage, meta_links) -> dict[str, str]:
    """
    Return external links (mapping from URL to link text) from startpage.
    Also add links to alternate language variants of the site.
    """
    external_links = startpage.init_fields['links_ext'].copy()
    netloc = startpage.init_fields['durl'].netloc
    for lang, lang_url in meta_links['alt_langs'].items():
        if netloc not in lang_url:
            durl = await Durl(lang_url)
            if durl:
                external_links[durl] = f'Alternate language: {lang}'
    return external_links
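
# Illustrative example: with netloc == 'example.org' and
# meta_links['alt_langs'] == {'fr': 'https://exemple.fr/'} (hypothetical
# URLs), the French variant lives on a foreign host, so it is added to
# the result with the link text 'Alternate language: fr'.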


def extract_meta_texts(
    page, meta
) -> tuple[Optional[str], Optional[str], list[str]]:
    """
    Extract and return title, description, keywords from a page and meta tags.
    """
    title = meta.get('og:site_name')
    if not title:
        title = page.search_fields['title'] or ''
        if meta_title := meta.pop('title', None):
            if meta_title.lower() not in title.lower():
                title += ('; ' if title else '') + meta_title
    title = cut_str(clean_html(title), 200)
    description = cut_str(clean_html(meta.pop('description', None)), 2000)
    if meta_keywords := meta.pop('keywords', None):
        kws = re_meta_keyword_sep.split(meta_keywords)
        keywords = [kw.strip()[:50] for kw in kws if kw.strip()]
        if len(keywords) < 2:
            # too few keywords: the value probably uses spaces as separators
            keywords = [
                kw.strip()[:50]
                for kw in meta_keywords.split(' ')
                if kw.strip()
            ]
    else:
        keywords = []
    return title, description, keywords
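
# Example (illustrative): a keywords value of 'python, crawling; indexing'
# is split on [,;\r\n] into ['python', 'crawling', 'indexing']; a value
# like 'python crawling indexing' yields fewer than 2 parts that way, so
# the function falls back to splitting on spaces.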


def extract_languages(page, meta, meta_links) -> set[str]:
    """
    Extract languages from a page's html tag, meta tags and HTTP headers.
    Also add the language detected in the text content of the page.
    Return a set of ISO 639-1 language codes.
    See also https://www.w3.org/International/questions/qa-http-and-lang and
    https://www.w3.org/International/questions/qa-html-language-declarations
    """
    languages = set()
    if lang := clean_lang(page.lang):  # language detected in the page text
        languages.add(lang)
    if lang := clean_lang(meta.get('http_equiv_lang')):
        languages.add(lang)
    if lang := clean_lang(meta.get('dc.language')):
        languages.add(lang)
    if lang := clean_lang(meta.get('og:locale')):
        languages.add(lang)
    for lang, lang_url in meta_links['alt_langs'].items():
        if page.init_fields['durl'].netloc in lang_url:
            if lng := clean_lang(lang):
                languages.add(lng)
    lngs = (
        page.init_fields['headers']
        .get('Content-Language', '')
        .lower()
        .replace(' ', '')
        .split(',')
    )
    for lng in lngs:
        if lang := clean_lang(lng):
            languages.add(lang)
    return languages
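
# Example (illustrative; assumes clean_lang reduces tags like 'en-US' to
# ISO 639-1 'en'): a page with <html lang="en-US">, a meta tag
# <meta property="og:locale" content="de_DE"> and an HTTP header
# 'Content-Language: fr, en' would yield {'en', 'de', 'fr'}.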