atextcrawler/src/atextcrawler/resource/sitemap.py

"""
Sitemap and SitemapIndex and related operations.
"""
import logging
from datetime import datetime
from typing import Optional

import pytz

from ..models import Sitemap, SitemapIndex, TextResource

logger = logging.getLogger(__name__)


async def get_sitemap_urls(
    fetcher,
    base_url: Optional[str],
    sitemaps=None,
) -> list[dict]:
    """
    Try to find sitemaps; fetch them and return the URLs they contain.

    Each sitemapped URL is a dict with key 'loc' and optional key 'lastmod'.
    """
    if sitemaps:
        # Sitemaps were given explicitly (test example: https://www.berlin.de/);
        # fetch all of them. Copy the list, because nested sitemaps may be
        # appended to it below.
        sitemaps = list(sitemaps)
        check_all = True
    elif base_url:
        # No sitemaps given: probe well-known sitemap locations under the
        # base URL and stop at the first one that is found.
        prefix = base_url.rstrip('/')
        sitemaps = [
            prefix + '/sitemap.xml',
            prefix + '/wp-sitemap.xml',
            prefix + '/sitemap_index.xml',
            prefix + '/sitemap.xml.gz',
            prefix + '/sitemap_index.xml.gz',
            prefix + '/sitemap.txt',
            prefix + '/sitemap/',
            prefix + '/sitemap1.xml',
            prefix + '/sitemap-index.xml',
            prefix + '/sitemapindex.xml',
            prefix + '/sitemap/index.xml',
        ]
        check_all = False
    else:
        return []
    urls = []
    for sitemap in sitemaps:
        resource = await fetcher.fetch(sitemap)
        found = True
        if isinstance(resource, SitemapIndex):
            # Queue the sitemaps of a sitemap index; since they are appended
            # to the list being iterated over, they are fetched in this loop.
            for sitemap_ in resource.sitemaps:
                sitemaps.append(sitemap_['loc'])
        elif isinstance(resource, Sitemap):
            urls += resource.urls
        elif isinstance(resource, TextResource) and resource.content_type in (
            'html',
            'plain',
        ):
            # Fall back to the internal links of an HTML or plaintext page.
            urls += [
                {'loc': durl.url()}
                for durl in resource.init_fields['links_int']
            ]
        else:
            found = False
        if found and not check_all:
            break
    return urls
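

# Usage sketch (illustration only, not executed as part of this module):
# `fetcher` is assumed to be this project's fetcher object, whose async
# `fetch(url)` returns a `Sitemap`, `SitemapIndex` or `TextResource`
# instance, matching the isinstance checks above:
#
#     urls = await get_sitemap_urls(fetcher, 'https://example.com')
#     # e.g. [{'loc': 'https://example.com/page1',
#     #        'lastmod': datetime(2021, 11, 1, 0, 0)}, ...]

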
def parse_sitemapindex(sitemapindex):
    """
    Parse a sitemap index, returning a `SitemapIndex` with the sitemaps found.
    """
    sitemaps = []
    for tag in sitemapindex.find_all('sitemap'):
        if loc := tag.find('loc'):
            if loc.string:
                sitemap = {'loc': loc.string.strip()}
                if lastmod := tag.find('lastmod'):
                    try:
                        t = datetime.fromisoformat(lastmod.string.strip())
                        sitemap['lastmod'] = t
                    except (AttributeError, ValueError):
                        # empty or unparsable <lastmod>: ignore it
                        pass
                sitemaps.append(sitemap)
    return SitemapIndex(sitemaps=sitemaps)
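

# Example (assuming the input is a BeautifulSoup tree, which provides the
# `find_all` / `find` / `.string` API used above):
#
#     from bs4 import BeautifulSoup
#     soup = BeautifulSoup(
#         '<sitemapindex><sitemap>'
#         '<loc>https://example.com/sitemap1.xml</loc>'
#         '<lastmod>2021-11-01</lastmod>'
#         '</sitemap></sitemapindex>',
#         'html.parser',
#     )
#     parse_sitemapindex(soup)
#     # -> SitemapIndex with sitemaps=[
#     #        {'loc': 'https://example.com/sitemap1.xml',
#     #         'lastmod': datetime(2021, 11, 1, 0, 0)}]

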
def parse_sitemap(urlset) -> Sitemap:
    """
    Parse a urlset, returning a `Sitemap` with the URLs found in it.

    Each URL is a dict with these keys+values:

    * loc: the full URL of a mapped resource
    * lastmod: optional datetime of its last modification
    * changefreq: optional info on the change frequency to be expected
    * priority: optional info on its priority relative to other resources

    Cf. https://www.sitemaps.org/protocol.html
    """
    urls = []
    for tag in urlset.find_all('url'):
        if loc := tag.find('loc'):
            if loc.string:
                url = {'loc': loc.string.strip()}
                if lastmod := tag.find('lastmod'):
                    try:
                        # fromisoformat (before Python 3.11) cannot parse a
                        # trailing 'Z', so replace it with an explicit UTC
                        # offset; normalize aware datetimes to naive UTC
                        t = lastmod.string.strip().replace('Z', '+00:00')
                        url['lastmod'] = (
                            datetime.fromisoformat(t)
                            .astimezone(pytz.utc)
                            .replace(tzinfo=None)
                        )
                    except (AttributeError, ValueError):
                        # empty or unparsable <lastmod>: ignore it
                        pass
                if (changefreq := tag.find('changefreq')) and changefreq.string:
                    url['changefreq'] = changefreq.string.strip()
                if (priority := tag.find('priority')) and priority.string:
                    url['priority'] = priority.string.strip()
                urls.append(url)
    return Sitemap(urls=urls)
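

# Example (again assuming BeautifulSoup input); an aware <lastmod> is
# normalized to a naive UTC datetime:
#
#     soup = BeautifulSoup(
#         '<urlset><url>'
#         '<loc>https://example.com/page1</loc>'
#         '<lastmod>2021-11-01T12:00:00+00:00</lastmod>'
#         '<priority>0.8</priority>'
#         '</url></urlset>',
#         'html.parser',
#     )
#     parse_sitemap(soup)
#     # -> Sitemap with urls=[{'loc': 'https://example.com/page1',
#     #                        'lastmod': datetime(2021, 11, 1, 12, 0),
#     #                        'priority': '0.8'}]

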
def extract_sitemap_paths(
    base_url: Optional[str],
    urls: list[dict],
) -> tuple[list[tuple[str, bool]], Optional[datetime]]:
    """
    Extract essential information from sitemap URLs.

    Return a list of relative paths of the site's resources
    (in a form suitable for `add_site_paths`) and
    the datetime of the latest change.

    Relative paths are computed using base_url.
    """
    paths = []
    latest = None
    for url in urls:
        loc = url['loc']
        lastmod = url.get('lastmod')
        if loc.startswith(base_url or ''):
            # make the URL relative to base_url and drop any fragment
            path = loc.removeprefix(base_url or '').lstrip('/')
            path = path.split('#', 1)[0]
            paths.append((path, True))
            if lastmod:
                latest = max(lastmod, latest or lastmod)
    return paths, latest
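

# Example: URLs outside base_url are dropped and fragments are stripped:
#
#     urls = [
#         {'loc': 'https://example.com/a/b#frag',
#          'lastmod': datetime(2021, 11, 1)},
#         {'loc': 'https://other.org/x'},
#     ]
#     extract_sitemap_paths('https://example.com', urls)
#     # -> ([('a/b', True)], datetime(2021, 11, 1, 0, 0))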