""" Seeding of new installations with URLs from blacklists and whitelists. """ from pathlib import Path import asyncpg from ..utils.durl import Durl async def load_seeds(config: dict, pool: asyncpg.Pool) -> None: """ Add seed file contents (site blacklist and whitelist). If there are sites already, do nothing. """ async with pool.acquire() as conn: site_count = await conn.fetchval("SELECT count(*) FROM site") if site_count: return # add blacklist entries values = [] blacklist = _load_list(config['config_dir'], 'black') for base_url in blacklist: durl = await Durl(base_url) if durl: url = durl.site() values.append((url, {'source': 'seed file'})) sql = ( "INSERT INTO site_annotation (base_url, ann_type, ann_content)" " VALUES ($1, 'blacklist', $2)" ) await conn.executemany(sql, values) # add whitelist entries values1 = [] values2 = [] whitelist = _load_list(config['config_dir'], 'white') for base_url in whitelist: durl = await Durl(base_url) if durl: url = durl.site() if url not in blacklist: values1.append((url, {'source': 'seed file'})) values2.append((url,)) sql = ( "INSERT INTO site_annotation (base_url, ann_type, ann_content)" " VALUES ($1, 'whitelist', $2)" ) await conn.executemany(sql, values1) sql = "INSERT INTO site_queue (src, url) VALUES (null, $1)" await conn.executemany(sql, values2) def _load_list(config_dir, black_white): """ Load the seed black or white list. """ path = Path(config_dir) / 'initial_data' / f'seed_urls.list' with open(path, 'r') as list_file: urls = [] for line in list_file.read().strip().splitlines(): line_ = line.strip() if line_.startswith('#'): continue if black_white == 'black' and line_.startswith('-'): urls.append(line_[1:].strip()) if black_white == 'white' and line_.startswith('+'): urls.append(line_[1:].strip()) return urls