# atextcrawler/src/atextcrawler/utils/link.py
"""
Hyperlinks (a href, link).
"""
from pathlib import Path
from typing import Optional
import tldextract
# Values of the rel attribute of anchor tags whose hrefs must not be
# followed by the crawler. A set literal is used (instead of set([...]))
# for the idiomatic form and O(1) membership tests.
nofollow_link_rels = {
    'nofollow',
    'search',
    'noreferrer',
    'noopener',
    'help',
    'license',
}
"""
Do not follow the hrefs in anchor tags with these values of the rel attribute.
"""
# Whitelist of <meta name="..."> attribute values whose content is kept.
meta_names = (
    'generator',
    'lang',
    'language',
    'description',
    'keywords',
    'author',
    'title',
    'subject',
    'revised',
    'abstract',
    'topic',
    'summary',
    'classfication',  # historical typo kept for backward compatibility
    'classification',  # correct spelling actually used in real meta tags
    'category',
    'reply-to',
    'owner',
    'url',
    'identifier-URL',
    'geo.position',
    'geo.region',
    'geo.placename',
    'dc.language',
)
"""
Values of the name attribute of meta tags to keep.
See also: https://gist.github.com/lancejpollard/1978404
See also: https://github.com/joshbuchea/HEAD
"""
# Whitelist of <meta property="..."> attribute values whose content is
# kept — Open Graph site, locale/type, and geo-location properties.
meta_props = (
    'og:site_name', 'og:locale', 'og:type',
    'og:latitude', 'og:longitude',
    'og:street', 'og:locality', 'og:region', 'og:postal', 'og:country',
)
"""
Values of the property attribute of meta tags to keep.
"""
# Values of the rel attribute of <link> tags that are kept. A set
# literal is used (instead of set([...])) for the idiomatic form and
# O(1) membership tests.
link_rels = {
    'webmention',
    'pingback',
    'alternate',
    'canonical',
    'author',
}
"""
Values of the rel attribute of link tags to keep.
"""
def load_blacklist() -> set:
    """
    Return the domains of the 10000 most popular internet sites.

    The domains are read from the bundled asset file ``top_1e4`` (one
    domain per line). They are returned as a set so that the membership
    test in :func:`in_blacklist` is O(1) instead of O(n) per lookup.
    """
    path = Path(__file__).parent.parent / 'assets' / 'top_1e4'
    # Path.read_text handles open/close; strip drops a trailing newline.
    return set(path.read_text().strip().splitlines())


# Loaded once at import time; used by in_blacklist().
domain_blacklist = load_blacklist()
def in_blacklist(hostname: str) -> Optional[str]:
    """
    Return the hostname if its domain is blacklisted, else None.
    """
    return hostname if extract_domain(hostname) in domain_blacklist else None
def extract_domain(hostname: str) -> str:
    """
    Extract the lower-case registered domain from a hostname.

    E.g. 'www.Example.co.uk' -> 'example.co.uk'.
    """
    parts = tldextract.extract(hostname)
    # Use the named fields rather than positional slicing (levels[-2:]):
    # newer tldextract releases append fields to ExtractResult, which
    # would silently break negative indexing. Output is identical.
    return f'{parts.domain}.{parts.suffix}'.lower()