atextcrawler/src/atextcrawler/plugin_defaults/filter_site_path.py

25 lines
638 B
Python

"""
Plugin for filtering paths of a site to be retrieved.
This plugin implements :func:`sp_filter`.
"""
def sp_filter(site, path, robots) -> bool:
    """
    Per-site path filter. Return whether the path shall be retrieved.

    :param site: site object; its ``base_url`` and ``meta_info``
        attributes are read
    :param path: path that is appended to ``site.base_url`` to form
        the URL checked against robots
    :param robots: object providing ``can_fetch_url(url)``
    :return: ``False`` if the path is to be skipped, else ``True``

    A path is rejected if ``robots`` disallows its URL, if the site's
    ``generator`` meta info mentions amusewiki (case-insensitive) and
    the path ends with one of several file suffixes, or if the path
    contains ``/bbselect?``.
    """
    if not robots.can_fetch_url(site.base_url + path):
        return False
    # The generator entry may be absent or explicitly None; treat both
    # as an empty string so that .lower() cannot fail.
    generator = site.meta_info.get('generator') or ''
    if 'amusewiki' in generator.lower():
        # str.endswith accepts a tuple and matches any of its members.
        if path.endswith(('.html', '.epub', '.tex', '.zip', '.pdf')):
            return False
    if '/bbselect?' in path:
        return False
    return True