atextcrawler/src/atextcrawler/resource/sitemap.py

"""
Sitemap and SitemapIndex and related operations.
"""
import logging
from datetime import datetime
from typing import Optional

import pytz

from ..models import Sitemap, SitemapIndex, TextResource

logger = logging.getLogger(__name__)


async def get_sitemap_urls(
    fetcher,
    base_url: Optional[str],
    sitemaps=None,
) -> list[dict]:
    """
    Try to find sitemaps; fetch them and return the URLs they contain.

    Each sitemapped URL is a dict with key 'loc' and optional key 'lastmod'.
    """
    if sitemaps:
        # Sitemaps were given explicitly (test example: https://www.berlin.de/);
        # fetch all of them. Copy the list, because nested sitemaps may be
        # appended to it below.
        sitemaps = list(sitemaps)
        check_all = True
    elif base_url:
        # No sitemaps given: probe well-known sitemap locations under the
        # base URL and stop at the first one that is found.
        prefix = base_url.rstrip('/')
        sitemaps = [
            prefix + '/sitemap.xml',
            prefix + '/wp-sitemap.xml',
            prefix + '/sitemap_index.xml',
            prefix + '/sitemap.xml.gz',
            prefix + '/sitemap_index.xml.gz',
            prefix + '/sitemap.txt',
            prefix + '/sitemap/',
            prefix + '/sitemap1.xml',
            prefix + '/sitemap-index.xml',
            prefix + '/sitemapindex.xml',
            prefix + '/sitemap/index.xml',
        ]
        check_all = False
    else:
        return []
    urls = []
    for sitemap in sitemaps:
        resource = await fetcher.fetch(sitemap)
        found = True
        if isinstance(resource, SitemapIndex):
            # Queue the sitemaps of a sitemap index; since they are appended
            # to the list being iterated over, they are fetched in this loop.
            for sitemap_ in resource.sitemaps:
                sitemaps.append(sitemap_['loc'])
        elif isinstance(resource, Sitemap):
            urls += resource.urls
        elif isinstance(resource, TextResource) and resource.content_type in (
            'html',
            'plain',
        ):
            # Fall back to the internal links of an HTML or plaintext page.
            urls += [
                {'loc': durl.url()}
                for durl in resource.init_fields['links_int']
            ]
        else:
            found = False
        if found and not check_all:
            break
    return urls
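

# Usage sketch (illustration only, not executed as part of this module):
# `fetcher` is assumed to be this project's fetcher object, whose async
# `fetch(url)` returns a `Sitemap`, `SitemapIndex` or `TextResource`
# instance, matching the isinstance checks above:
#
#     urls = await get_sitemap_urls(fetcher, 'https://example.com')
#     # e.g. [{'loc': 'https://example.com/page1',
#     #        'lastmod': datetime(2021, 11, 1, 0, 0)}, ...]

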
def parse_sitemapindex(sitemapindex):
    """
    Parse a sitemap index, returning a `SitemapIndex` with the sitemaps found.
    """
    sitemaps = []
    for tag in sitemapindex.find_all('sitemap'):
        if loc := tag.find('loc'):
            if loc.string:
                sitemap = {'loc': loc.string.strip()}
                if lastmod := tag.find('lastmod'):
                    try:
                        t = datetime.fromisoformat(lastmod.string.strip())
                        sitemap['lastmod'] = t
                    except (AttributeError, ValueError):
                        # empty or unparsable <lastmod>: ignore it
                        pass
                sitemaps.append(sitemap)
    return SitemapIndex(sitemaps=sitemaps)
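

# Example (assuming the input is a BeautifulSoup tree, which provides the
# `find_all` / `find` / `.string` API used above):
#
#     from bs4 import BeautifulSoup
#     soup = BeautifulSoup(
#         '<sitemapindex><sitemap>'
#         '<loc>https://example.com/sitemap1.xml</loc>'
#         '<lastmod>2021-11-01</lastmod>'
#         '</sitemap></sitemapindex>',
#         'html.parser',
#     )
#     parse_sitemapindex(soup)
#     # -> SitemapIndex with sitemaps=[
#     #        {'loc': 'https://example.com/sitemap1.xml',
#     #         'lastmod': datetime(2021, 11, 1, 0, 0)}]

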
def parse_sitemap(urlset) -> Sitemap:
    """
    Parse a urlset, returning a `Sitemap` with the URLs found in it.

    Each URL is a dict with these keys+values:

    * loc: the full URL of a mapped resource
    * lastmod: optional datetime of its last modification
    * changefreq: optional info on the change frequency to be expected
    * priority: optional info on its priority relative to other resources

    Cf. https://www.sitemaps.org/protocol.html
    """
    urls = []
    for tag in urlset.find_all('url'):
        if loc := tag.find('loc'):
            if loc.string:
                url = {'loc': loc.string.strip()}
                if lastmod := tag.find('lastmod'):
                    try:
                        # fromisoformat (before Python 3.11) cannot parse a
                        # trailing 'Z', so replace it with an explicit UTC
                        # offset; normalize aware datetimes to naive UTC
                        t = lastmod.string.strip().replace('Z', '+00:00')
                        url['lastmod'] = (
                            datetime.fromisoformat(t)
                            .astimezone(pytz.utc)
                            .replace(tzinfo=None)
                        )
                    except (AttributeError, ValueError):
                        # empty or unparsable <lastmod>: ignore it
                        pass
                if (changefreq := tag.find('changefreq')) and changefreq.string:
                    url['changefreq'] = changefreq.string.strip()
                if (priority := tag.find('priority')) and priority.string:
                    url['priority'] = priority.string.strip()
                urls.append(url)
    return Sitemap(urls=urls)
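

# Example (again assuming BeautifulSoup input); an aware <lastmod> is
# normalized to a naive UTC datetime:
#
#     soup = BeautifulSoup(
#         '<urlset><url>'
#         '<loc>https://example.com/page1</loc>'
#         '<lastmod>2021-11-01T12:00:00+00:00</lastmod>'
#         '<priority>0.8</priority>'
#         '</url></urlset>',
#         'html.parser',
#     )
#     parse_sitemap(soup)
#     # -> Sitemap with urls=[{'loc': 'https://example.com/page1',
#     #                        'lastmod': datetime(2021, 11, 1, 12, 0),
#     #                        'priority': '0.8'}]

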
def extract_sitemap_paths(
    base_url: Optional[str],
    urls: list[dict],
) -> tuple[list[tuple[str, bool]], Optional[datetime]]:
    """
    Extract essential information from sitemap URLs.

    Return a list of relative paths of the site's resources
    (in a form suitable for `add_site_paths`) and
    the datetime of the latest change.

    Relative paths are computed using base_url.
    """
    paths = []
    latest = None
    for url in urls:
        loc = url['loc']
        lastmod = url.get('lastmod')
        if loc.startswith(base_url or ''):
            # make the URL relative to base_url and drop any fragment
            path = loc.removeprefix(base_url or '').lstrip('/')
            path = path.split('#', 1)[0]
            paths.append((path, True))
            if lastmod:
                latest = max(lastmod, latest or lastmod)
    return paths, latest
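

# Example: URLs outside base_url are dropped and fragments are stripped:
#
#     urls = [
#         {'loc': 'https://example.com/a/b#frag',
#          'lastmod': datetime(2021, 11, 1)},
#         {'loc': 'https://other.org/x'},
#     ]
#     extract_sitemap_paths('https://example.com', urls)
#     # -> ([('a/b', True)], datetime(2021, 11, 1, 0, 0))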