"""
|
|
Sitemap and SitemapIndex and related operations.
|
|
"""
|
|
|
|
import logging
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
|
|
import pytz
|
|
|
|
from ..models import Sitemap, SitemapIndex, TextResource
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def get_sitemap_urls(
    fetcher,
    base_url: Optional[str],
    sitemaps: Optional[list[str]] = None,
) -> list[dict]:
    """
    Try to find sitemaps and fetch and return their URL content.

    Each sitemapped URL is a dict with key 'loc' and optional key 'lastmod'.

    If *sitemaps* is given, all of them are checked; otherwise a list of
    well-known sitemap locations derived from *base_url* is probed and
    probing stops at the first one that yields results. With neither
    *sitemaps* nor *base_url*, an empty list is returned.
    """
    if sitemaps:
        # test example: https://www.berlin.de/
        # Work on a copy: sitemap indexes found below append their child
        # sitemaps to the candidate list, and we must not mutate the
        # caller's argument.
        candidates = list(sitemaps)
        check_all = True
    elif base_url:
        root = base_url.rstrip('/')
        # Well-known sitemap locations, roughly ordered by likelihood.
        candidates = [
            root + path
            for path in (
                '/sitemap.xml',
                '/wp-sitemap.xml',
                '/sitemap_index.xml',
                '/sitemap.xml.gz',
                '/sitemap_index.xml.gz',
                '/sitemap.txt',
                '/sitemap/',
                '/sitemap1.xml',
                '/sitemap-index.xml',
                '/sitemapindex.xml',
                '/sitemap/index.xml',
            )
        ]
        check_all = False
    else:
        return []
    urls = []
    # NB: candidates may grow while we iterate (a sitemap index adds its
    # child sitemaps); Python list iteration picks up appended items.
    for sitemap in candidates:
        resource = await fetcher.fetch(sitemap)
        found = True
        if isinstance(resource, SitemapIndex):
            for sitemap_ in resource.sitemaps:
                candidates.append(sitemap_['loc'])
        elif isinstance(resource, Sitemap):
            urls += resource.urls
        elif isinstance(resource, TextResource) and resource.content_type in (
            'html',
            'plain',
        ):
            # Fall back to the internal links of an HTML or plain-text
            # "sitemap" page.
            urls += [
                {'loc': durl.url()}
                for durl in resource.init_fields['links_int']
            ]
        else:
            found = False
        if found and not check_all:
            break
    return urls
|
|
|
|
|
|
def parse_sitemapindex(sitemapindex):
    """
    Parse a sitemap index returning a `SitemapIndex` with found sitemaps.

    *sitemapindex* is a parsed XML tree exposing a BeautifulSoup-style
    API (`find_all` / `find` / `.string`). Each found sitemap is a dict
    with key 'loc' and an optional 'lastmod' datetime.
    """
    sitemaps = []
    for tag in sitemapindex.find_all('sitemap'):
        if (loc := tag.find('loc')) and loc.string:
            sitemap = {'loc': loc.string.strip()}
            # <lastmod> is optional; skip empty tags and silently ignore
            # unparsable timestamps (was a bare `except:` before, which
            # also masked AttributeError on empty <lastmod/> tags).
            lastmod = tag.find('lastmod')
            if lastmod and lastmod.string:
                try:
                    sitemap['lastmod'] = datetime.fromisoformat(
                        lastmod.string.strip()
                    )
                except ValueError:
                    pass
            sitemaps.append(sitemap)
    return SitemapIndex(sitemaps=sitemaps)
|
|
|
|
|
|
def parse_sitemap(urlset) -> Sitemap:
    """
    Return a `Sitemap` with the URLs found in *urlset*.

    Each URL is a dict with these keys+values:

    * loc: the full URL of a mapped resource
    * lastmod: optional datetime of its last modification (naive, UTC)
    * changefreq: optional info on the change frequency to be expected
    * priority: optional info on its priority relative to other resources

    Cf. https://www.sitemaps.org/protocol.html
    """
    urls = []
    for tag in urlset.find_all('url'):
        if (loc := tag.find('loc')) and loc.string:
            url = {'loc': loc.string.strip()}
            lastmod = tag.find('lastmod')
            if lastmod and lastmod.string:
                # NOTE(review): a trailing 'Z' is stripped, so the value is
                # parsed as naive and `astimezone` then interprets it as
                # *local* time before converting to UTC — confirm intended.
                t = lastmod.string.strip().rstrip('Z')
                try:
                    url['lastmod'] = (
                        datetime.fromisoformat(t)
                        .astimezone(pytz.utc)
                        .replace(tzinfo=None)
                    )
                except ValueError:
                    # Unparsable timestamp: drop it, keep the URL.
                    pass
            # Guard `.string` — an empty <changefreq/> or <priority/> has
            # `.string is None` and previously crashed with AttributeError.
            changefreq = tag.find('changefreq')
            if changefreq and changefreq.string:
                url['changefreq'] = changefreq.string.strip()
            priority = tag.find('priority')
            if priority and priority.string:
                url['priority'] = priority.string.strip()
            urls.append(url)
    return Sitemap(urls=urls)
|
|
|
|
|
|
def extract_sitemap_paths(
    base_url: Optional[str],
    urls: list[dict],
) -> tuple[list[tuple[str, bool]], Optional[datetime]]:
    """
    Extract essential information from sitemap URLs.

    Return a list of relative paths of the site's resources
    (in a form to be easily fed into `add_site_paths`) and
    the datetime of the latest change.

    Relative paths are computed using base_url.
    """
    prefix = base_url or ''
    paths: list[tuple[str, bool]] = []
    latest: Optional[datetime] = None
    for url in urls:
        loc = url['loc']
        if not loc.startswith(prefix):
            continue
        # Relative path with any fragment part removed.
        rel = loc.removeprefix(prefix).lstrip('/').split('#', 1)[0]
        paths.append((rel, True))
        lastmod = url.get('lastmod')
        if lastmod is not None and (latest is None or lastmod > latest):
            latest = lastmod
    return paths, latest
|