# atextcrawler/src/atextcrawler/utils/durl.py
"""
Hyperlink parsing.
"""
import logging
from typing import Optional
from urllib.parse import urlsplit
import tldextract
from async_dns import types
from async_dns.resolver import ProxyResolver
from async_lru import alru_cache
from .link import in_blacklist
logger = logging.getLogger(__name__)
resolver = ProxyResolver(request_timeout=2)
async_dns_logger = logging.getLogger('async_dns')
async_dns_logger.setLevel(logging.WARNING)
extract = tldextract.TLDExtract(cache_dir=False)
# tldextract uses filelock; set its loglevel to warning
filelock_logger = logging.getLogger('filelock')
filelock_logger.setLevel(logging.WARNING)
class Durl:
"""
Decomposed URL, contains :class:`urllib.parse.SplitResult`.
When constructing this class, it has to be awaited, e.g.:
my_durl = await Durl('http://www.example.com/whatever')
The given URL will be decomposed, validated and normalized.
If the URL is invalid, we return None instead of an instance.
If the given *base* is None, the URL must be absolute and
the hostname must be valid (DNS lookup).
If the given URL is not absolute, an already decomposed (and thus
valid) *base* Durl must be given; otherwise the URL is invalid.
The *base* Durl can contain a path (but no arguments or fragments),
in which case the URL - if not absolute - must begin with this path.
The scheme must be http or https. If the URL begins with '//',
'http:' is prepended.
If the hostname is longer than 90 characters, the URL is invalid.
Default port numbers (80 for http, 443 for https) are removed.
The hostname is changed to lower case. Spaces in the hostname
make the URL invalid.
URL fragments are removed.
"""
_url = None
_base = None
_match_base = False
def __init__(
self,
url: str,
base: Optional['Durl'] = None,
match_base: bool = False,
):
self._url = url
self._base = base
self._match_base = match_base
def __await__(self):
return self.__ainit__().__await__()
async def __ainit__(self):
res = None
try:
# add missing scheme for urls beginning with '//'
if self._url.startswith('//'):
self._url = 'http:' + self._url
# split the url
durl = urlsplit(self._url)
            # normalize the netloc: lower-case the hostname, keep any
            # credentials, and drop default port numbers
            # (80 for http, 443 for https)
            netloc = durl.netloc
            if durl.hostname and durl.netloc != durl.hostname:
                user_pass = ''
                if durl.username and durl.password:
                    user_pass = f'{durl.username}:{durl.password}@'
                port = ''
                if durl.port and (durl.scheme, durl.port) not in (
                    ('http', 80),
                    ('https', 443),
                ):
                    port = f':{durl.port}'
                netloc = f'{user_pass}{durl.hostname.lower()}{port}'
            durl = durl._replace(netloc=netloc)
if self._base:
# if missing fill in scheme and netloc from base
if not durl.scheme:
durl = durl._replace(scheme=self._base.scheme)
if not durl.netloc:
durl = durl._replace(netloc=self._base.netloc)
# if match_base, then set res only if the
# url is compatible with base url
if not self._match_base:
res = durl
else:
if durl.netloc == self._base.netloc:
if durl.scheme == self._base.scheme:
if self._base.path not in ('/', ''):
if durl.path.startswith(self._base.path):
res = durl
else:
res = durl
else:
res = durl
        except Exception:
logger.exception(
f'Durl init failed url={self._url}'
f' base={self._base} match_base={self._match_base}'
)
res = None
if res:
res = res._replace(fragment='')
if not res.hostname or len(res.hostname) > 90:
res = None
elif res.scheme not in ('https', 'http'):
res = None
elif ' ' in res.hostname or '.' not in res.hostname:
res = None
elif not (await get_ips(res.hostname)):
res = None
elif not res.path.startswith('/'):
res = res._replace(path='/')
if res:
self._durl = res
return self
self._durl = None
def __getattr__(self, attr):
return getattr(self._durl, attr)
def url(self) -> str:
"""
Return the URL as string.
"""
return self._durl.geturl()
def pwa(self) -> str:
"""
Return the (base-relative) path with args of the Durl.
"""
if self._base and self._match_base:
path = self._durl.path.removeprefix(self._base.path)
else:
path = self._durl.path
qs = f'?{self._durl.query}' if self._durl.query else ''
return f'{path}{qs}'.lstrip('/')
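    # Illustrative example (not from the original module; the values are made
    # up and assume 'example.com' resolves): with
    #     base = await Durl('https://example.com/blog/')
    #     durl = await Durl('/blog/post-1?page=2', base=base, match_base=True)
    # durl.pwa() would return 'post-1?page=2', i.e. the path relative to the
    # base plus the query string.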
def has_path(self) -> bool:
"""
        Return whether the Durl has a non-trivial path.
"""
return self._durl.path not in ('/', '')
def site(self) -> str:
"""
Return the site (base_url).
"""
return f'{self._durl.scheme}://{self._durl.netloc}/'
def domain(self) -> str:
"""
Return the domain of the Durl (wrong in case of second-level domains).
"""
levels = extract(self._durl.hostname)
return '.'.join(levels[-2:]).lower()
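    # Illustrative example (not from the original module): for a Durl whose
    # hostname is 'www.example.com', tldextract splits it into subdomain
    # 'www', domain 'example' and suffix 'com', so domain() returns
    # 'example.com'.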
def replace_scheme(self, scheme: str) -> None:
"""
Replace the scheme (must be 'http' or 'https').
"""
self._durl = self._durl._replace(scheme=scheme)
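# Illustrative usage sketch (not part of the original module; the URL is a
# made-up example and awaiting a Durl performs a real DNS lookup, so the
# expected values below assume 'www.example.com' resolves and follow the
# behaviour documented in the class docstring):
#
#     durl = await Durl('http://www.example.com/some/page?q=1#top')
#     durl.url()      -> 'http://www.example.com/some/page?q=1'  (fragment dropped)
#     durl.site()     -> 'http://www.example.com/'
#     durl.pwa()      -> 'some/page?q=1'
#     durl.domain()   -> 'example.com'
#     durl.has_path() -> True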
@alru_cache(maxsize=1000)
async def get_ips(hostname: str) -> set[str]:
"""
Return IPv4 and IPv6 addresses of the given hostname.
"""
ips = set()
for type_ in (types.A, types.AAAA):
try:
res, cached = await resolver.query(hostname, type_)
if res:
if addr := res.get_record([type_]):
ips.add(addr.data)
        except Exception:
pass
return ips
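# Example of the expected return shape (illustrative only; actual addresses
# depend on live DNS): await get_ips('example.com') might return a set such
# as {'192.0.2.1', '2001:db8::1'}, or an empty set if resolution fails.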
def get_url_variants(url: str) -> list[str]:
"""
Return variants of the URL.
Replace http with https and vice versa;
prepend or remove 'www.' to or from the beginning of the hostname.
"""
if url.startswith('http://www.'):
s = url.removeprefix('http://www.')
return [url, f'http://{s}', f'https://www.{s}', f'https://{s}']
elif url.startswith('http://'):
s = url.removeprefix('http://')
return [url, f'http://www.{s}', f'https://www.{s}', f'https://{s}']
elif url.startswith('https://www.'):
s = url.removeprefix('https://www.')
return [url, f'https://{s}', f'http://www.{s}', f'http://{s}']
elif url.startswith('https://'):
s = url.removeprefix('https://')
return [url, f'https://www.{s}', f'http://www.{s}', f'http://{s}']
else:
return [url]
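# Illustrative example (not from the original module; pure string
# manipulation, no network access):
#     get_url_variants('https://www.example.com/path') returns
#     ['https://www.example.com/path', 'https://example.com/path',
#      'http://www.example.com/path', 'http://example.com/path']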
async def assort_links(
links: dict[str, tuple[int, int, list[str]]],
durl: Durl,
text: str,
    base_url: Optional[str] = None,
) -> tuple[
dict[str, tuple[int, int, list[str]]],
dict[Durl, tuple[list[str], str]],
dict[Durl, tuple[list[str], str]],
]:
"""
Sort links into a cleaned, an internal and an external dict.
The cleaned dict maps absolute URLs to char ranges and relations.
The internal dict maps absolute URLs to relations and the linked text.
The external dict maps absolute URLs to relations and the linked text.
The relations are link relations, e.g. rel="canonical".
    If *base_url* is set, it is used to distinguish internal from external
    links. If it is not set, the base_url is obtained from *durl*.
"""
res_int = {}
res_ext = {}
if not base_url:
base_url = durl.site().lower()
base_durl = await Durl(base_url)
cleaned_links = {}
for href, (i, f, rel) in links.items():
durl = await Durl(href, base=base_durl)
if not durl:
continue
if durl.hostname and in_blacklist(durl.hostname):
continue
cleaned_links[durl.url()] = i, f, rel
txt = text[i:f]
if durl.site().lower() == base_url:
res_int[durl] = rel, txt
else:
res_ext[durl] = rel, txt
return cleaned_links, res_int, res_ext
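# Illustrative sketch of assort_links (not part of the original module; the
# data is made up, and awaiting Durl instances triggers DNS lookups and the
# blacklist check, so it assumes both hostnames resolve and are not
# blacklisted):
#
#     text = 'Home and elsewhere'
#     links = {
#         '/about': (0, 4, []),                           # anchor text 'Home'
#         'https://other.org/x': (9, 18, ['nofollow']),   # anchor text 'elsewhere'
#     }
#     durl = await Durl('https://example.com/')
#     cleaned, internal, external = await assort_links(links, durl, text)
#     # cleaned  maps absolute URLs to (start, end, rel) tuples
#     # internal maps Durl instances on example.com to (rel, anchor_text)
#     # external maps all other Durl instances to (rel, anchor_text)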