atextcrawler/src/atextcrawler/resource/sitemap.py


"""
Sitemap and SitemapIndex and related operations.
"""

import logging
from datetime import datetime
from typing import Optional

import pytz

from ..models import Sitemap, SitemapIndex, TextResource

logger = logging.getLogger(__name__)


async def get_sitemap_urls(
    fetcher,
    base_url: Optional[str],
    sitemaps=None,
) -> list[dict]:
    """
    Try to find sitemaps, fetch them and return the URLs they list.

    Each sitemapped URL is a dict with key 'loc' and optional key 'lastmod'.
    """
    if sitemaps:
        # test example: https://www.berlin.de/
        check_all = True
    elif base_url:
        sitemaps = [
            base_url.rstrip('/') + '/sitemap.xml',
            base_url.rstrip('/') + '/wp-sitemap.xml',
            base_url.rstrip('/') + '/sitemap_index.xml',
            base_url.rstrip('/') + '/sitemap.xml.gz',
            base_url.rstrip('/') + '/sitemap_index.xml.gz',
            base_url.rstrip('/') + '/sitemap.txt',
            base_url.rstrip('/') + '/sitemap/',
            base_url.rstrip('/') + '/sitemap1.xml',
            base_url.rstrip('/') + '/sitemap-index.xml',
            base_url.rstrip('/') + '/sitemapindex.xml',
            base_url.rstrip('/') + '/sitemap/index.xml',
        ]
        check_all = False
    else:
        return []
    urls = []
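    # Probe the candidates one by one; child sitemaps found in a sitemap
    # index are appended to `sitemaps` during iteration, so they get fetched
    # too. When probing guessed candidates (check_all is False), stop after
    # the first one that works.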
    for sitemap in sitemaps:
        resource = await fetcher.fetch(sitemap)
        found = True
        if isinstance(resource, SitemapIndex):
            for sitemap_ in resource.sitemaps:
                sitemaps.append(sitemap_['loc'])
        elif isinstance(resource, Sitemap):
            urls += resource.urls
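        # A sitemap served as an HTML or plain-text page (e.g. '/sitemap/'):
        # fall back to the page's internal links.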
        elif isinstance(resource, TextResource) and resource.content_type in (
            'html',
            'plain',
        ):
            urls += [
                {'loc': durl.url()}
                for durl in resource.init_fields['links_int']
            ]
        else:
            found = False
        if found and not check_all:
            break
    return urls


def parse_sitemapindex(sitemapindex):
    """
    Parse a sitemap index, returning a `SitemapIndex` with the sitemaps found.
    """
    sitemaps = []
    for tag in sitemapindex.find_all('sitemap'):
        if loc := tag.find('loc'):
            if loc.string:
                sitemap = {'loc': loc.string.strip()}
                if lastmod := tag.find('lastmod'):
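                    # Parse lastmod leniently; ignore malformed values.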
                    try:
                        t = datetime.fromisoformat(lastmod.string.strip())
                        sitemap['lastmod'] = t
                    except (AttributeError, ValueError):
                        pass
                sitemaps.append(sitemap)
    return SitemapIndex(sitemaps=sitemaps)


def parse_sitemap(urlset) -> Sitemap:
    """
    Return a `Sitemap` with the URLs found in the given urlset.

    Each URL is a dict with these keys+values:

    * loc: the full URL of a mapped resource
    * lastmod: optional datetime of its last modification
    * changefreq: optional info on the change frequency to be expected
    * priority: optional info on its priority relative to other resources

    Cf. https://www.sitemaps.org/protocol.html
    """
    urls = []
    for tag in urlset.find_all('url'):
        if loc := tag.find('loc'):
            if loc.string:
                url = {'loc': loc.string.strip()}
                if lastmod := tag.find('lastmod'):
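                    # datetime.fromisoformat (before Python 3.11) does not
                    # accept a trailing 'Z', so strip it; store lastmod as
                    # naive UTC.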
                    try:
                        t = lastmod.string.strip().rstrip('Z')
                        url['lastmod'] = (
                            datetime.fromisoformat(t)
                            .astimezone(pytz.utc)
                            .replace(tzinfo=None)
                        )
                    except (AttributeError, ValueError):
                        pass
                if changefreq := tag.find('changefreq'):
                    url['changefreq'] = changefreq.string.strip()
                if priority := tag.find('priority'):
                    url['priority'] = priority.string.strip()
                urls.append(url)
    return Sitemap(urls=urls)


def extract_sitemap_paths(
    base_url: Optional[str],
    urls: list[dict],
) -> tuple[list[tuple[str, bool]], Optional[datetime]]:
    """
    Extract essential information from sitemap URLs.

    Return a list of relative paths of the site's resources
    (in a form to be easily fed into `add_site_paths`) and
    the datetime of the latest change.

    Relative paths are computed using base_url.
    """
    paths = []
    latest = None
    for url in urls:
        loc = url['loc']
        lastmod = url.get('lastmod')
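        # Keep only URLs below base_url; make the path relative to it and
        # drop any fragment part.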
        if loc.startswith(base_url or ''):
            path = loc.removeprefix(base_url or '').lstrip('/')
            path = path.split('#', 1)[0]
            paths.append((path, True))
            if lastmod:
                latest = max(lastmod, latest or lastmod)
    return paths, latest
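

# Illustrative usage sketch (assumes `fetcher` is the crawler's resource
# fetcher, whose `fetch()` is expected to return Sitemap, SitemapIndex or
# TextResource instances, as the type checks above rely on); to be run from
# async code:
#
#     urls = await get_sitemap_urls(fetcher, 'https://example.org')
#     paths, latest = extract_sitemap_paths('https://example.org', urls)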