atextcrawler/src/atextcrawler/resource/sitemap.py


"""
Sitemap and SitemapIndex and related operations.
"""

import logging
from datetime import datetime
from typing import Optional

import pytz

from ..models import Sitemap, SitemapIndex, TextResource

logger = logging.getLogger(__name__)


async def get_sitemap_urls(
    fetcher,
    base_url: Optional[str],
    sitemaps=None,
) -> list[dict]:
    """
    Try to find sitemaps, fetch them and return the URLs they list.

    Each sitemapped URL is a dict with key 'loc' and optional key 'lastmod'.
    """
    if sitemaps:
        # test example: https://www.berlin.de/
        check_all = True
    elif base_url:
        sitemaps = [
            base_url.rstrip('/') + '/sitemap.xml',
            base_url.rstrip('/') + '/wp-sitemap.xml',
            base_url.rstrip('/') + '/sitemap_index.xml',
            base_url.rstrip('/') + '/sitemap.xml.gz',
            base_url.rstrip('/') + '/sitemap_index.xml.gz',
            base_url.rstrip('/') + '/sitemap.txt',
            base_url.rstrip('/') + '/sitemap/',
            base_url.rstrip('/') + '/sitemap1.xml',
            base_url.rstrip('/') + '/sitemap-index.xml',
            base_url.rstrip('/') + '/sitemapindex.xml',
            base_url.rstrip('/') + '/sitemap/index.xml',
        ]
        check_all = False
    else:
        return []
    urls = []
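    # Probe the candidates one by one; child sitemaps found in a sitemap
    # index are appended to `sitemaps` during iteration, so they get fetched
    # too. When probing guessed candidates (check_all is False), stop after
    # the first one that works.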
    for sitemap in sitemaps:
        resource = await fetcher.fetch(sitemap)
        found = True
        if isinstance(resource, SitemapIndex):
            for sitemap_ in resource.sitemaps:
                sitemaps.append(sitemap_['loc'])
        elif isinstance(resource, Sitemap):
            urls += resource.urls
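        # A sitemap served as an HTML or plain-text page (e.g. '/sitemap/'):
        # fall back to the page's internal links.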
        elif isinstance(resource, TextResource) and resource.content_type in (
            'html',
            'plain',
        ):
            urls += [
                {'loc': durl.url()}
                for durl in resource.init_fields['links_int']
            ]
        else:
            found = False
        if found and not check_all:
            break
    return urls


def parse_sitemapindex(sitemapindex):
    """
    Parse a sitemap index, returning a `SitemapIndex` with the sitemaps found.
    """
    sitemaps = []
    for tag in sitemapindex.find_all('sitemap'):
        if loc := tag.find('loc'):
            if loc.string:
                sitemap = {'loc': loc.string.strip()}
                if lastmod := tag.find('lastmod'):
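                    # Parse lastmod leniently; ignore malformed values.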
                    try:
                        t = datetime.fromisoformat(lastmod.string.strip())
                        sitemap['lastmod'] = t
                    except (AttributeError, ValueError):
                        pass
                sitemaps.append(sitemap)
    return SitemapIndex(sitemaps=sitemaps)


def parse_sitemap(urlset) -> Sitemap:
    """
    Return a `Sitemap` with the URLs found in the given urlset.

    Each URL is a dict with these keys+values:

    * loc: the full URL of a mapped resource
    * lastmod: optional datetime of its last modification
    * changefreq: optional info on the change frequency to be expected
    * priority: optional info on its priority relative to other resources

    Cf. https://www.sitemaps.org/protocol.html
    """
    urls = []
    for tag in urlset.find_all('url'):
        if loc := tag.find('loc'):
            if loc.string:
                url = {'loc': loc.string.strip()}
                if lastmod := tag.find('lastmod'):
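                    # datetime.fromisoformat (before Python 3.11) does not
                    # accept a trailing 'Z', so strip it; store lastmod as
                    # naive UTC.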
                    try:
                        t = lastmod.string.strip().rstrip('Z')
                        url['lastmod'] = (
                            datetime.fromisoformat(t)
                            .astimezone(pytz.utc)
                            .replace(tzinfo=None)
                        )
                    except (AttributeError, ValueError):
                        pass
                if changefreq := tag.find('changefreq'):
                    url['changefreq'] = changefreq.string.strip()
                if priority := tag.find('priority'):
                    url['priority'] = priority.string.strip()
                urls.append(url)
    return Sitemap(urls=urls)


def extract_sitemap_paths(
    base_url: Optional[str],
    urls: list[dict],
) -> tuple[list[tuple[str, bool]], Optional[datetime]]:
    """
    Extract essential information from sitemap URLs.

    Return a list of relative paths of the site's resources
    (in a form to be easily fed into `add_site_paths`) and
    the datetime of the latest change.

    Relative paths are computed using base_url.
    """
    paths = []
    latest = None
    for url in urls:
        loc = url['loc']
        lastmod = url.get('lastmod')
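        # Keep only URLs below base_url; make the path relative to it and
        # drop any fragment part.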
        if loc.startswith(base_url or ''):
            path = loc.removeprefix(base_url or '').lstrip('/')
            path = path.split('#', 1)[0]
            paths.append((path, True))
            if lastmod:
                latest = max(lastmod, latest or lastmod)
    return paths, latest
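

# Illustrative usage sketch (assumes `fetcher` is the crawler's resource
# fetcher, whose `fetch()` is expected to return Sitemap, SitemapIndex or
# TextResource instances, as the type checks above rely on); to be run from
# async code:
#
#     urls = await get_sitemap_urls(fetcher, 'https://example.org')
#     paths, latest = extract_sitemap_paths('https://example.org', urls)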