# atextcrawler/src/atextcrawler/utils/durl.py
"""
Hyperlink parsing.
"""
import logging
from typing import Optional
from urllib.parse import urlsplit
import tldextract
from async_dns import types
from async_dns.resolver import ProxyResolver
from async_lru import alru_cache
from .link import in_blacklist
logger = logging.getLogger(__name__)
resolver = ProxyResolver(request_timeout=2)
async_dns_logger = logging.getLogger('async_dns')
async_dns_logger.setLevel(logging.WARNING)
extract = tldextract.TLDExtract(cache_dir=False)
# tldextract uses filelock; set its loglevel to warning
filelock_logger = logging.getLogger('filelock')
filelock_logger.setLevel(logging.WARNING)
class Durl:
"""
Decomposed URL, contains :class:`urllib.parse.SplitResult`.
When constructing this class, it has to be awaited, e.g.:
my_durl = await Durl('http://www.example.com/whatever')
The given URL will be decomposed, validated and normalized.
If the URL is invalid, we return None instead of an instance.
If the given *base* is None, the URL must be absolute and
the hostname must be valid (DNS lookup).
If the given URL is not absolute, an already decomposed (and thus
valid) *base* Durl must be given; otherwise the URL is invalid.
The *base* Durl can contain a path (but no arguments or fragments),
in which case the URL - if not absolute - must begin with this path.
The scheme must be http or https. If the URL begins with '//',
'http:' is prepended.
If the hostname is longer than 90 characters, the URL is invalid.
Default port numbers (80 for http, 443 for https) are removed.
The hostname is changed to lower case. Spaces in the hostname
make the URL invalid.
URL fragments are removed.
"""
_url = None
_base = None
_match_base = False
def __init__(
self,
url: str,
base: Optional['Durl'] = None,
match_base: bool = False,
):
self._url = url
self._base = base
self._match_base = match_base
def __await__(self):
return self.__ainit__().__await__()
async def __ainit__(self):
res = None
try:
# add missing scheme for urls beginning with '//'
if self._url.startswith('//'):
self._url = 'http:' + self._url
# split the url
durl = urlsplit(self._url)
            # normalize the netloc: lower-case the hostname, keep any
            # credentials, and drop default port numbers
            # (80 for http, 443 for https)
            netloc = durl.netloc
            if durl.hostname and durl.netloc != durl.hostname:
                user_pass = ''
                if durl.username and durl.password:
                    user_pass = f'{durl.username}:{durl.password}@'
                port = ''
                if durl.port and (durl.scheme, durl.port) not in (
                    ('http', 80),
                    ('https', 443),
                ):
                    port = f':{durl.port}'
                netloc = f'{user_pass}{durl.hostname.lower()}{port}'
            durl = durl._replace(netloc=netloc)
if self._base:
# if missing fill in scheme and netloc from base
if not durl.scheme:
durl = durl._replace(scheme=self._base.scheme)
if not durl.netloc:
durl = durl._replace(netloc=self._base.netloc)
# if match_base, then set res only if the
# url is compatible with base url
if not self._match_base:
res = durl
else:
if durl.netloc == self._base.netloc:
if durl.scheme == self._base.scheme:
if self._base.path not in ('/', ''):
if durl.path.startswith(self._base.path):
res = durl
else:
res = durl
else:
res = durl
        except Exception:
logger.exception(
f'Durl init failed url={self._url}'
f' base={self._base} match_base={self._match_base}'
)
res = None
if res:
res = res._replace(fragment='')
if not res.hostname or len(res.hostname) > 90:
res = None
elif res.scheme not in ('https', 'http'):
res = None
elif ' ' in res.hostname or '.' not in res.hostname:
res = None
elif not (await get_ips(res.hostname)):
res = None
elif not res.path.startswith('/'):
res = res._replace(path='/')
if res:
self._durl = res
return self
self._durl = None
def __getattr__(self, attr):
return getattr(self._durl, attr)
def url(self) -> str:
"""
Return the URL as string.
"""
return self._durl.geturl()
def pwa(self) -> str:
"""
Return the (base-relative) path with args of the Durl.
"""
if self._base and self._match_base:
path = self._durl.path.removeprefix(self._base.path)
else:
path = self._durl.path
qs = f'?{self._durl.query}' if self._durl.query else ''
return f'{path}{qs}'.lstrip('/')
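    # Illustrative example (not from the original module; the values are made
    # up and assume 'example.com' resolves): with
    #     base = await Durl('https://example.com/blog/')
    #     durl = await Durl('/blog/post-1?page=2', base=base, match_base=True)
    # durl.pwa() would return 'post-1?page=2', i.e. the path relative to the
    # base plus the query string.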
def has_path(self) -> bool:
"""
        Return whether the Durl has a non-trivial path.
"""
return self._durl.path not in ('/', '')
def site(self) -> str:
"""
Return the site (base_url).
"""
return f'{self._durl.scheme}://{self._durl.netloc}/'
def domain(self) -> str:
"""
Return the domain of the Durl (wrong in case of second-level domains).
"""
levels = extract(self._durl.hostname)
return '.'.join(levels[-2:]).lower()
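    # Illustrative example (not from the original module): for a Durl whose
    # hostname is 'www.example.com', tldextract splits it into subdomain
    # 'www', domain 'example' and suffix 'com', so domain() returns
    # 'example.com'.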
def replace_scheme(self, scheme: str) -> None:
"""
Replace the scheme (must be 'http' or 'https').
"""
self._durl = self._durl._replace(scheme=scheme)
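# Illustrative usage sketch (not part of the original module; the URL is a
# made-up example and awaiting a Durl performs a real DNS lookup, so the
# expected values below assume 'www.example.com' resolves and follow the
# behaviour documented in the class docstring):
#
#     durl = await Durl('http://www.example.com/some/page?q=1#top')
#     durl.url()      -> 'http://www.example.com/some/page?q=1'  (fragment dropped)
#     durl.site()     -> 'http://www.example.com/'
#     durl.pwa()      -> 'some/page?q=1'
#     durl.domain()   -> 'example.com'
#     durl.has_path() -> True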
@alru_cache(maxsize=1000)
async def get_ips(hostname: str) -> set[str]:
"""
Return IPv4 and IPv6 addresses of the given hostname.
"""
ips = set()
for type_ in (types.A, types.AAAA):
try:
res, cached = await resolver.query(hostname, type_)
if res:
if addr := res.get_record([type_]):
ips.add(addr.data)
        except Exception:
pass
return ips
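# Example of the expected return shape (illustrative only; actual addresses
# depend on live DNS): await get_ips('example.com') might return a set such
# as {'192.0.2.1', '2001:db8::1'}, or an empty set if resolution fails.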
def get_url_variants(url: str) -> list[str]:
"""
Return variants of the URL.
Replace http with https and vice versa;
prepend or remove 'www.' to or from the beginning of the hostname.
"""
if url.startswith('http://www.'):
s = url.removeprefix('http://www.')
return [url, f'http://{s}', f'https://www.{s}', f'https://{s}']
elif url.startswith('http://'):
s = url.removeprefix('http://')
return [url, f'http://www.{s}', f'https://www.{s}', f'https://{s}']
elif url.startswith('https://www.'):
s = url.removeprefix('https://www.')
return [url, f'https://{s}', f'http://www.{s}', f'http://{s}']
elif url.startswith('https://'):
s = url.removeprefix('https://')
return [url, f'https://www.{s}', f'http://www.{s}', f'http://{s}']
else:
return [url]
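# Illustrative example (not from the original module; pure string
# manipulation, no network access):
#     get_url_variants('https://www.example.com/path') returns
#     ['https://www.example.com/path', 'https://example.com/path',
#      'http://www.example.com/path', 'http://example.com/path']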
async def assort_links(
links: dict[str, tuple[int, int, list[str]]],
durl: Durl,
text: str,
    base_url: Optional[str] = None,
) -> tuple[
dict[str, tuple[int, int, list[str]]],
dict[Durl, tuple[list[str], str]],
dict[Durl, tuple[list[str], str]],
]:
"""
Sort links into a cleaned, an internal and an external dict.
The cleaned dict maps absolute URLs to char ranges and relations.
The internal dict maps absolute URLs to relations and the linked text.
The external dict maps absolute URLs to relations and the linked text.
The relations are link relations, e.g. rel="canonical".
    If *base_url* is set, it is used to distinguish internal from external
    links. If it is not set, the base_url is obtained from *durl*.
"""
res_int = {}
res_ext = {}
if not base_url:
base_url = durl.site().lower()
base_durl = await Durl(base_url)
cleaned_links = {}
for href, (i, f, rel) in links.items():
durl = await Durl(href, base=base_durl)
if not durl:
continue
if durl.hostname and in_blacklist(durl.hostname):
continue
cleaned_links[durl.url()] = i, f, rel
txt = text[i:f]
if durl.site().lower() == base_url:
res_int[durl] = rel, txt
else:
res_ext[durl] = rel, txt
return cleaned_links, res_int, res_ext
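# Illustrative sketch of assort_links (not part of the original module; the
# data is made up, and awaiting Durl instances triggers DNS lookups and the
# blacklist check, so it assumes both hostnames resolve and are not
# blacklisted):
#
#     text = 'Home and elsewhere'
#     links = {
#         '/about': (0, 4, []),                           # anchor text 'Home'
#         'https://other.org/x': (9, 18, ['nofollow']),   # anchor text 'elsewhere'
#     }
#     durl = await Durl('https://example.com/')
#     cleaned, internal, external = await assort_links(links, durl, text)
#     # cleaned  maps absolute URLs to (start, end, rel) tuples
#     # internal maps Durl instances on example.com to (rel, anchor_text)
#     # external maps all other Durl instances to (rel, anchor_text)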