# atextcrawler/src/atextcrawler/utils/http.py

"""
Utility functions related to http.
"""
import re
from typing import Optional
from multidict import CIMultiDictProxy
from ..models import Site
from .durl import Durl
# Pre-compiled regexes for parsing HTTP ``Link`` headers.
# Raw strings are required: ``'\s'`` in a plain literal is an invalid
# escape sequence (DeprecationWarning since Python 3.6, SyntaxWarning
# since 3.12).
re_ = {
    # Split a Link header on commas that separate individual links
    # (lookahead for '<' so commas inside attribute values are kept).
    'link_header': re.compile(r',\s*(?=<)'),
    # Match a ``rel=canonical`` attribute (optionally quoted).
    'rel_canonical': re.compile(r';\s*rel\s*=\s*["\']?canonical', re.I),
    # Match a ``rel=shortlink`` attribute (optionally quoted).
    'rel_shortlink': re.compile(r';\s*rel\s*=\s*["\']?shortlink', re.I),
}
async def get_header_links(
    headers: CIMultiDictProxy,
    durl: Durl,
    site: Optional[Site],
) -> dict[str, Optional[str]]:
    """
    Extract canonical and shortlink URLs from HTTP ``Link`` headers.

    *durl* must be the Durl of the fetched page and *site* - if not None -
    must be the Site to which the page belongs.

    Return a dict that always contains the keys 'canonical' and
    'shortlink'; each value is the resolved absolute URL, or None if the
    headers did not provide one.
    """
    canonical = shortlink = None
    if 'link' in headers and (link_headers := headers.getall('link')):
        # There may be multiple Link headers, and each header may carry
        # several comma-separated links.
        links = []
        for link_header in link_headers:
            links += re_['link_header'].split(link_header)
        url = durl.url()
        # Resolve relative link targets against the site's base URL when
        # known, otherwise against the fetched page's own URL.
        base_url = site.base_url if site else url
        base_durl = await Durl(base_url) if base_url else None
        for link in links:
            # Cheap substring check first; the regex then confirms the
            # ``rel`` attribute, so a URL merely containing 'canonical'
            # cannot match.
            if not canonical and 'canonical' in link.lower():
                if re_['rel_canonical'].search(link):
                    # The target URL is the '<...>'-enclosed prefix.
                    canon_url = link.strip().lstrip('<').split('>')[0]
                    if canon_durl := await Durl(canon_url, base=base_durl):
                        canonical = canon_durl.url()
            if not shortlink and 'shortlink' in link.lower():
                if re_['rel_shortlink'].search(link):
                    short_url = link.strip().lstrip('<').split('>')[0]
                    if short_durl := await Durl(short_url, base=base_durl):
                        shortlink = short_durl.url()
            if canonical and shortlink:
                break
    return {'canonical': canonical, 'shortlink': shortlink}