atextcrawler/src/atextcrawler/site/seed.py


"""
Seeding of new installations with URLs from blacklists and whitelists.
"""
from pathlib import Path
import asyncpg
from ..utils.durl import Durl

async def load_seeds(config: dict, pool: asyncpg.Pool) -> None:
    """
    Add seed file contents (site blacklist and whitelist).

    If there are sites already, do nothing.
    """
    async with pool.acquire() as conn:
        site_count = await conn.fetchval("SELECT count(*) FROM site")
        if site_count:
            return

        # add blacklist entries
        values = []
        blacklist = _load_list(config['config_dir'], 'black')
        blacklist_urls = set()
        for base_url in blacklist:
            durl = await Durl(base_url)
            if durl:
                url = durl.site()
                blacklist_urls.add(url)
                values.append((url, {'source': 'seed file'}))
        sql = (
            "INSERT INTO site_annotation (base_url, ann_type, ann_content)"
            " VALUES ($1, 'blacklist', $2)"
        )
        await conn.executemany(sql, values)

        # add whitelist entries, skipping sites that are also blacklisted;
        # compare normalized site URLs, not the raw seed file entries
        values1 = []
        values2 = []
        whitelist = _load_list(config['config_dir'], 'white')
        for base_url in whitelist:
            durl = await Durl(base_url)
            if durl:
                url = durl.site()
                if url not in blacklist_urls:
                    values1.append((url, {'source': 'seed file'}))
                    values2.append((url,))
        sql = (
            "INSERT INTO site_annotation (base_url, ann_type, ann_content)"
            " VALUES ($1, 'whitelist', $2)"
        )
        await conn.executemany(sql, values1)
        # queue whitelisted sites for crawling
        sql = "INSERT INTO site_queue (src, url) VALUES (null, $1)"
        await conn.executemany(sql, values2)
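

# The seed file parsed by _load_list below is expected to look like this
# (the URLs are illustrative placeholders, not part of the original module):
#
#     # sites to exclude from crawling
#     -https://spam.example.com
#     # sites to queue for crawling
#     +https://blog.example.org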


def _load_list(config_dir: str, black_white: str) -> list[str]:
    """
    Load the seed black or white list.

    Lines starting with '#' are comments; a leading '-' marks a
    blacklist entry, a leading '+' a whitelist entry.
    """
    path = Path(config_dir) / 'initial_data' / 'seed_urls.list'
    urls = []
    with open(path) as list_file:
        for line in list_file.read().strip().splitlines():
            line_ = line.strip()
            if line_.startswith('#'):
                continue
            if black_white == 'black' and line_.startswith('-'):
                urls.append(line_[1:].strip())
            if black_white == 'white' and line_.startswith('+'):
                urls.append(line_[1:].strip())
    return urls
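

# Usage sketch (not part of the original module): seed a freshly installed
# database. The DSN and config directory are hypothetical placeholders; run
# as a module (e.g. `python -m atextcrawler.site.seed`) so the relative
# imports above resolve.
if __name__ == '__main__':
    import asyncio

    async def _main() -> None:
        pool = await asyncpg.create_pool(
            'postgresql://user:password@localhost/atextcrawler'
        )
        try:
            await load_seeds({'config_dir': '/etc/atextcrawler'}, pool)
        finally:
            await pool.close()

    asyncio.run(_main())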