59 lines
1.9 KiB
Python
59 lines
1.9 KiB
Python
"""
|
|
Utility functions related to HTTP.
|
|
"""
|
|
|
|
import re
|
|
from typing import Optional
|
|
|
|
from multidict import CIMultiDictProxy
|
|
|
|
from ..models import Site
|
|
from .durl import Durl
|
|
|
|
# Pre-compiled patterns for parsing HTTP ``Link`` header values.
# Raw strings are used so that ``\s`` reaches the regex engine verbatim
# instead of being an invalid string escape (SyntaxWarning on Python 3.12+).
re_ = {
    # Separator between individual links within one header value: a comma
    # (plus optional whitespace) that is directly followed by '<'. The
    # lookahead keeps the '<' in the next item and avoids splitting on
    # commas that are not followed by '<' (e.g. inside a URL).
    'link_header': re.compile(r',\s*(?=<)'),
    # A ``rel`` parameter whose value begins with "canonical" (optionally
    # quoted with single or double quotes), matched case-insensitively.
    'rel_canonical': re.compile(r';\s*rel\s*=\s*["\']?canonical', re.I),
    # A ``rel`` parameter whose value begins with "shortlink" (optionally
    # quoted with single or double quotes), matched case-insensitively.
    'rel_shortlink': re.compile(r';\s*rel\s*=\s*["\']?shortlink', re.I),
}
|
|
|
|
|
|
async def get_header_links(
    headers: CIMultiDictProxy,
    durl: Durl,
    site: Optional[Site],
) -> dict[str, Optional[str]]:
    """
    Find the canonical link and the shortlink announced in http headers.

    All values of the 'link' header are taken into account; each value
    is split into its individual links before being examined.

    *durl* must be the Durl of the fetched page and *site* - if not None -
    must be the Site to which the page belongs. The base handed to Durl
    when building the link targets is derived from the site's base_url
    if a site is given, and from the page URL otherwise.

    Return a plain dict with the keys 'canonical' and 'shortlink'.
    A value is None if no matching link was found.
    """
    found: dict[str, Optional[str]] = {'canonical': None, 'shortlink': None}

    # Guard clause: without any 'link' header there is nothing to extract.
    header_values = headers.getall('link') if 'link' in headers else None
    if not header_values:
        return found

    # One header value may carry several comma-separated links.
    candidates = [
        part
        for header_value in header_values
        for part in re_['link_header'].split(header_value)
    ]

    page_url = durl.url()
    base_url = page_url if not site else site.base_url
    if base_url:
        base_durl = await Durl(base_url)
    else:
        base_durl = None

    # (result key / cheap substring probe, key of the precise rel pattern)
    wanted = (
        ('canonical', 'rel_canonical'),
        ('shortlink', 'rel_shortlink'),
    )
    for candidate in candidates:
        lowered = candidate.lower()
        for rel, pattern_key in wanted:
            # Only the first usable link of each kind is kept.
            if found[rel] or rel not in lowered:
                continue
            if not re_[pattern_key].search(candidate):
                continue
            # The target is the text between the leading '<' and the
            # first '>'.
            target = candidate.strip().lstrip('<').partition('>')[0]
            if target_durl := await Durl(target, base=base_durl):
                found[rel] = target_durl.url()
        if found['canonical'] and found['shortlink']:
            break
    return found
|