# atextcrawler/src/atextcrawler/utils/link.py
"""
Hyperlinks (a href, link).
"""
from pathlib import Path
from typing import Optional
import tldextract
# Values of the rel attribute of anchor tags whose hrefs must not be
# followed by the crawler. A set literal is used (instead of set([...]))
# for the idiomatic form and O(1) membership tests.
nofollow_link_rels = {
    'nofollow',
    'search',
    'noreferrer',
    'noopener',
    'help',
    'license',
}
"""
Do not follow the hrefs in anchor tags with these values of the rel attribute.
"""
# Whitelist of <meta name="..."> attribute values whose content is kept.
meta_names = (
    'generator',
    'lang',
    'language',
    'description',
    'keywords',
    'author',
    'title',
    'subject',
    'revised',
    'abstract',
    'topic',
    'summary',
    'classfication',  # historical typo kept for backward compatibility
    'classification',  # correct spelling actually used in real meta tags
    'category',
    'reply-to',
    'owner',
    'url',
    'identifier-URL',
    'geo.position',
    'geo.region',
    'geo.placename',
    'dc.language',
)
"""
Values of the name attribute of meta tags to keep.
See also: https://gist.github.com/lancejpollard/1978404
See also: https://github.com/joshbuchea/HEAD
"""
# Whitelist of <meta property="..."> attribute values whose content is
# kept — Open Graph site, locale/type, and geo-location properties.
meta_props = (
    'og:site_name', 'og:locale', 'og:type',
    'og:latitude', 'og:longitude',
    'og:street', 'og:locality', 'og:region', 'og:postal', 'og:country',
)
"""
Values of the property attribute of meta tags to keep.
"""
# Values of the rel attribute of <link> tags that are kept. A set
# literal is used (instead of set([...])) for the idiomatic form and
# O(1) membership tests.
link_rels = {
    'webmention',
    'pingback',
    'alternate',
    'canonical',
    'author',
}
"""
Values of the rel attribute of link tags to keep.
"""
def load_blacklist() -> set:
    """
    Return the domains of the 10000 most popular internet sites.

    The domains are read from the bundled asset file ``top_1e4`` (one
    domain per line). They are returned as a set so that the membership
    test in :func:`in_blacklist` is O(1) instead of O(n) per lookup.
    """
    path = Path(__file__).parent.parent / 'assets' / 'top_1e4'
    # Path.read_text handles open/close; strip drops a trailing newline.
    return set(path.read_text().strip().splitlines())


# Loaded once at import time; used by in_blacklist().
domain_blacklist = load_blacklist()
def in_blacklist(hostname: str) -> Optional[str]:
    """
    Return the hostname if its domain is blacklisted, else None.
    """
    return hostname if extract_domain(hostname) in domain_blacklist else None
def extract_domain(hostname: str) -> str:
    """
    Extract the lower-case registered domain from a hostname.

    E.g. 'www.Example.co.uk' -> 'example.co.uk'.
    """
    parts = tldextract.extract(hostname)
    # Use the named fields rather than positional slicing (levels[-2:]):
    # newer tldextract releases append fields to ExtractResult, which
    # would silently break negative indexing. Output is identical.
    return f'{parts.domain}.{parts.suffix}'.lower()