117 lines
2.0 KiB
Python
117 lines
2.0 KiB
Python
"""
|
|
Hyperlinks (a href, link).
|
|
"""
|
|
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import tldextract
|
|
|
|
nofollow_link_rels = {
    'nofollow',
    'search',
    'noreferrer',
    'noopener',
    'help',
    'license',
}
"""
Values of the rel attribute of anchor tags whose hrefs must not be followed.
"""
|
|
|
|
|
|
meta_names = (
    'generator',
    'lang',
    'language',
    'description',
    'keywords',
    'author',
    'title',
    'subject',
    'revised',
    'abstract',
    'topic',
    'summary',
    # Previously misspelled as 'classfication'.
    'classification',
    'category',
    'reply-to',
    'owner',
    'url',
    # NOTE(review): the only entry with upper-case letters -- confirm
    # whether callers lower-case the name attribute before matching.
    'identifier-URL',
    'geo.position',
    'geo.region',
    'geo.placename',
    'dc.language',
)
"""
Values of the name attribute of meta tags to keep.

See also: https://gist.github.com/lancejpollard/1978404
See also: https://github.com/joshbuchea/HEAD
"""
|
|
|
|
|
|
meta_props = (
    # site-level properties
    'og:site_name', 'og:locale', 'og:type',
    # geographic coordinates
    'og:latitude', 'og:longitude',
    # postal address parts
    # NOTE(review): the legacy Open Graph location properties were named
    # 'og:street-address', 'og:postal-code' and 'og:country-name' --
    # confirm that the shortened forms below are intended.
    'og:street', 'og:locality', 'og:region', 'og:postal', 'og:country',
)
"""
Values of the property attribute of meta tags to keep.
"""
|
|
|
|
|
|
link_rels = {
    'webmention',
    'pingback',
    'alternate',
    'canonical',
    'author',
}
"""
Values of the rel attribute of link tags to keep.
"""
|
|
|
|
|
|
def load_blacklist() -> list:
    """
    Return the domains listed in the ``assets/top_1e4`` file.

    The ``assets`` directory is expected beside the directory that
    contains this module. Judging by its name, the file holds the 10000
    most popular internet domains.

    The file is read as UTF-8 text. Whitespace at the very beginning and
    end of the file is removed before splitting into lines, so a final
    newline does not produce an empty entry. The result is a list of
    strings, one per line, in file order.

    Raises OSError (e.g. FileNotFoundError) if the file cannot be read.
    """
    path = Path(__file__).parent.parent / 'assets' / 'top_1e4'
    # Decode explicitly as UTF-8 so that the result does not depend on
    # the default encoding of the platform's locale.
    return path.read_text(encoding='utf-8').strip().splitlines()
|
|
|
|
|
|
# Loaded once, when the module is imported. Kept as a frozenset so that
# the membership test in in_blacklist() is a hash lookup instead of a
# linear scan of a list. Iteration and ``in`` keep working; the file
# order and duplicate lines are not preserved.
domain_blacklist = frozenset(load_blacklist())
|
|
|
|
|
|
def in_blacklist(hostname: str) -> Optional[str]:
    """
    Check whether the domain of *hostname* is blacklisted.

    The hostname is reduced to its domain with `extract_domain`. If that
    domain is contained in `domain_blacklist`, the unchanged *hostname*
    is returned, otherwise None.
    """
    blacklisted = extract_domain(hostname) in domain_blacklist
    return hostname if blacklisted else None
|
|
|
|
|
|
def extract_domain(hostname: str) -> str:
    """
    Extract the lower-case domain from a hostname.

    The hostname is split with `tldextract.extract`; the domain and the
    public suffix are joined with a dot and lower-cased, so that
    'www.Example.co.uk' gives 'example.co.uk'. Subdomains are dropped.

    If no public suffix is recognized (e.g. 'localhost' or an IP
    address), only the domain part is returned, without a trailing dot.
    """
    parts = tldextract.extract(hostname)
    # Use the named fields rather than slicing the result: slicing relies
    # on the result being a plain (subdomain, domain, suffix) tuple, which
    # newer tldextract releases no longer guarantee.
    labels = (parts.domain, parts.suffix)
    # Skip empty parts so that a missing suffix does not leave a dangling
    # '.' at the end of the returned domain.
    return '.'.join(label for label in labels if label).lower()
|