"""
Hyperlink parsing.
"""
|
|
|
|
import logging
|
|
from typing import Optional
|
|
from urllib.parse import urlsplit
|
|
|
|
import tldextract
|
|
from async_dns import types
|
|
from async_dns.resolver import ProxyResolver
|
|
from async_lru import alru_cache
|
|
|
|
from .link import in_blacklist
|
|
|
|
logger = logging.getLogger(__name__)


# DNS resolver used to validate hostnames; short timeout so link
# checking does not stall on dead or slow name servers.
resolver = ProxyResolver(request_timeout=2)


# async_dns logs query details below WARNING; silence them.
async_dns_logger = logging.getLogger('async_dns')
async_dns_logger.setLevel(logging.WARNING)


# Public-suffix extractor; cache_dir=False disables the on-disk suffix
# list cache (presumably to avoid filesystem writes — confirm intent).
extract = tldextract.TLDExtract(cache_dir=False)


# tldextract uses filelock; set its loglevel to warning
filelock_logger = logging.getLogger('filelock')
filelock_logger.setLevel(logging.WARNING)
|
|
|
|
|
|
class Durl:
    """
    Decomposed URL, contains :class:`urllib.parse.SplitResult`.

    When constructing this class, it has to be awaited, e.g.:

        my_durl = await Durl('http://www.example.com/whatever')

    The given URL will be decomposed, validated and normalized.
    If the URL is invalid, we return None instead of an instance.

    If the given *base* is None, the URL must be absolute and
    the hostname must be valid (DNS lookup).

    If the given URL is not absolute, an already decomposed (and thus
    valid) *base* Durl must be given; otherwise the URL is invalid.

    The *base* Durl can contain a path (but no arguments or fragments),
    in which case the URL - if not absolute - must begin with this path.

    The scheme must be http or https. If the URL begins with '//',
    'http:' is prepended.

    If the hostname is longer than 90 characters, the URL is invalid.

    Default port numbers (80 for http, 443 for https) are removed.

    The hostname is changed to lower case. Spaces in the hostname
    make the URL invalid.

    URL fragments are removed.
    """

    # internal state, filled by __init__ / __ainit__
    _url = None         # raw URL string as given
    _base = None        # optional base Durl for relative URLs
    _match_base = False  # whether the URL must lie under the base's path

    def __init__(
        self,
        url: str,
        base: Optional['Durl'] = None,
        match_base: bool = False,
    ):
        self._url = url
        self._base = base
        self._match_base = match_base

    def __await__(self):
        # Awaitable-constructor pattern: ``await Durl(...)`` delegates to
        # the async initializer, which returns self or None.
        return self.__ainit__().__await__()

    async def __ainit__(self):
        """
        Decompose, normalize and validate the URL; return self or None.
        """
        res = None
        try:
            # add missing scheme for urls beginning with '//'
            if self._url.startswith('//'):
                self._url = 'http:' + self._url
            # split the url
            durl = urlsplit(self._url)
            # remove default port numbers 80, 443
            # (durl.port may raise ValueError for junk ports; handled below)
            default_port = (
                (durl.port == 80 and durl.scheme == 'http')
                or (durl.port == 443 and durl.scheme == 'https')
            )
            netloc = durl.netloc
            if default_port:
                netloc = netloc.removesuffix(str(durl.port)).rstrip(':')
            # lower-case the hostname, keeping credentials and any
            # non-default port intact
            if durl.hostname and durl.hostname != durl.netloc.lower():
                user_pass = ''
                if durl.username and durl.password:
                    user_pass = f'{durl.username}:{durl.password}@'
                # BUGFIX: previously a just-removed default port was
                # unconditionally re-added here for mixed-case hosts
                port = ''
                if durl.port and not default_port:
                    port = f':{durl.port}'
                netloc = f'{user_pass}{durl.hostname.lower()}{port}'
            durl = durl._replace(netloc=netloc)

            if self._base:
                # if missing fill in scheme and netloc from base
                if not durl.scheme:
                    durl = durl._replace(scheme=self._base.scheme)
                if not durl.netloc:
                    durl = durl._replace(netloc=self._base.netloc)
                # if match_base, then set res only if the
                # url is compatible with base url
                if not self._match_base:
                    res = durl
                else:
                    if durl.netloc == self._base.netloc:
                        if durl.scheme == self._base.scheme:
                            if self._base.path not in ('/', ''):
                                if durl.path.startswith(self._base.path):
                                    res = durl
                            else:
                                res = durl
            else:
                res = durl
        except Exception:
            # was a bare ``except:``; Exception keeps asyncio.CancelledError
            # (a BaseException) propagating so task cancellation still works
            logger.exception(
                f'Durl init failed url={self._url}'
                f' base={self._base} match_base={self._match_base}'
            )
            res = None
        if res:
            # drop the fragment and run the validity checks
            res = res._replace(fragment='')
            if not res.hostname or len(res.hostname) > 90:
                res = None
            elif res.scheme not in ('https', 'http'):
                res = None
            elif ' ' in res.hostname or '.' not in res.hostname:
                res = None
            elif not (await get_ips(res.hostname)):
                # hostname must resolve to at least one IP address
                res = None
            elif not res.path.startswith('/'):
                res = res._replace(path='/')
        if res:
            # (removed dead code that assigned to res.fragment: fragment is
            # always '' here, and SplitResult is immutable anyway)
            self._durl = res
            return self
        self._durl = None

    def __getattr__(self, attr):
        # Forward unknown attributes (scheme, netloc, path, ...) to the
        # underlying SplitResult.
        return getattr(self._durl, attr)

    def url(self) -> str:
        """
        Return the URL as string.
        """
        return self._durl.geturl()

    def pwa(self) -> str:
        """
        Return the (base-relative) path with args of the Durl.
        """
        if self._base and self._match_base:
            path = self._durl.path.removeprefix(self._base.path)
        else:
            path = self._durl.path
        qs = f'?{self._durl.query}' if self._durl.query else ''
        return f'{path}{qs}'.lstrip('/')

    def has_path(self) -> bool:
        """
        Return whether the Durl has a non-trivial path.
        """
        return self._durl.path not in ('/', '')

    def site(self) -> str:
        """
        Return the site (base_url).
        """
        return f'{self._durl.scheme}://{self._durl.netloc}/'

    def domain(self) -> str:
        """
        Return the domain of the Durl (wrong in case of second-level domains).
        """
        # relies on tldextract's result being tuple-like:
        # (subdomain, domain, suffix) -> join the last two
        levels = extract(self._durl.hostname)
        return '.'.join(levels[-2:]).lower()

    def replace_scheme(self, scheme: str) -> None:
        """
        Replace the scheme (must be 'http' or 'https').
        """
        self._durl = self._durl._replace(scheme=scheme)
|
|
|
|
|
|
@alru_cache(maxsize=1000)
async def get_ips(hostname: str) -> set[str]:
    """
    Return IPv4 and IPv6 addresses of the given hostname.

    Queries A and AAAA records via the module-level *resolver*; lookup
    failures are treated as "no address" (best effort). Results are
    LRU-cached (1000 entries).
    """
    ips = set()
    for type_ in (types.A, types.AAAA):
        try:
            res, _cached = await resolver.query(hostname, type_)
            if res:
                if addr := res.get_record([type_]):
                    ips.add(addr.data)
        except Exception:
            # best-effort resolution: swallow lookup errors, but no longer a
            # bare ``except:``, which would also eat asyncio.CancelledError
            pass
    return ips
|
|
|
|
|
|
def get_url_variants(url: str) -> list[str]:
    """
    Return variants of the URL.

    Replace http with https and vice versa;
    prepend or remove 'www.' to or from the beginning of the hostname.
    """
    for scheme, other in (('http://', 'https://'), ('https://', 'http://')):
        www_prefix = scheme + 'www.'
        if url.startswith(www_prefix):
            # drop 'www.': same scheme bare, other scheme with and without
            rest = url.removeprefix(www_prefix)
            return [url, scheme + rest, other + 'www.' + rest, other + rest]
        if url.startswith(scheme):
            # add 'www.': same scheme, then other scheme with and without
            rest = url.removeprefix(scheme)
            return [
                url,
                scheme + 'www.' + rest,
                other + 'www.' + rest,
                other + rest,
            ]
    # unknown scheme: no variants
    return [url]
|
|
|
|
|
|
async def assort_links(
    links: dict[str, tuple[int, int, list[str]]],
    durl: Durl,
    text: str,
    base_url: Optional[str] = None,
) -> tuple[
    dict[str, tuple[int, int, list[str]]],
    dict[Durl, tuple[list[str], str]],
    dict[Durl, tuple[list[str], str]],
]:
    """
    Sort links into a cleaned, an internal and an external dict.

    The cleaned dict maps absolute URLs to char ranges and relations.
    The internal dict maps absolute URLs to relations and the linked text.
    The external dict maps absolute URLs to relations and the linked text.
    The relations are link relations, e.g. rel="canonical".

    If the base_url is set, it is used to distinguish internal and external
    links. If it is not set, the base_url is obtained from *durl*.
    """
    res_int = {}
    res_ext = {}
    if not base_url:
        base_url = durl.site().lower()
    base_durl = await Durl(base_url)
    cleaned_links = {}
    for href, (i, f, rel) in links.items():
        # use a distinct name: the loop variable used to shadow *durl*
        link_durl = await Durl(href, base=base_durl)
        if not link_durl:
            # invalid or unresolvable URL
            continue
        if link_durl.hostname and in_blacklist(link_durl.hostname):
            continue
        cleaned_links[link_durl.url()] = i, f, rel
        txt = text[i:f]  # the linked (anchor) text
        if link_durl.site().lower() == base_url:
            res_int[link_durl] = rel, txt
        else:
            res_ext[link_durl] = rel, txt
    return cleaned_links, res_int, res_ext
|