73 lines
2.3 KiB
Python
73 lines
2.3 KiB
Python
"""
|
|
Seeding of new installations with URLs from blacklists and whitelists.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
|
|
import asyncpg
|
|
|
|
from ..utils.durl import Durl
|
|
|
|
|
|
async def load_seeds(config: dict, pool: asyncpg.Pool) -> None:
    """
    Add seed file contents (site blacklist and whitelist).

    If there are sites already, do nothing.

    Otherwise every seed blacklist entry for which `Durl` gives a truthy
    result is stored as a 'blacklist' site annotation. Every such seed
    whitelist entry that is not blacklisted is stored as a 'whitelist'
    site annotation and is also added to the site queue.

    Only `config['config_dir']` is read from the configuration.
    """
    async with pool.acquire() as conn:
        site_count = await conn.fetchval("SELECT count(*) FROM site")
        if site_count:
            return

        # collect blacklist entries
        blacklist = _load_list(config['config_dir'], 'black')
        # Whitelist candidates are compared as returned by durl.site(),
        # so the blacklist must be known in that form as well; the raw
        # seed strings alone would miss entries written differently.
        blacklisted = set(blacklist)
        black_values = []
        for base_url in blacklist:
            durl = await Durl(base_url)
            if durl:
                url = durl.site()
                blacklisted.add(url)
                black_values.append((url, {'source': 'seed file'}))

        # collect whitelist entries
        white_values = []
        queue_values = []
        whitelist = _load_list(config['config_dir'], 'white')
        for base_url in whitelist:
            durl = await Durl(base_url)
            if durl:
                url = durl.site()
                if url not in blacklisted:
                    white_values.append((url, {'source': 'seed file'}))
                    queue_values.append((url,))

        blacklist_sql = (
            "INSERT INTO site_annotation (base_url, ann_type, ann_content)"
            " VALUES ($1, 'blacklist', $2)"
        )
        whitelist_sql = (
            "INSERT INTO site_annotation (base_url, ann_type, ann_content)"
            " VALUES ($1, 'whitelist', $2)"
        )
        queue_sql = "INSERT INTO site_queue (src, url) VALUES (null, $1)"

        # Insert everything atomically: the guard above only looks at the
        # site table, so a partially written seed would otherwise be
        # inserted a second time on the next run.
        async with conn.transaction():
            await conn.executemany(blacklist_sql, black_values)
            await conn.executemany(whitelist_sql, white_values)
            await conn.executemany(queue_sql, queue_values)
|
|
|
|
|
|
def _load_list(config_dir, black_white):
    """
    Load the seed black or white list.

    Both lists are read from the same file,
    `<config_dir>/initial_data/seed_urls.list`. After stripping
    surrounding whitespace, lines beginning with '#' are comments,
    lines beginning with '-' are blacklist entries and lines beginning
    with '+' are whitelist entries. All other lines are ignored.

    `config_dir` may be a string or a path object. `black_white`
    selects the list: 'black' or 'white'; for any other value the
    result is an empty list (the file is still read).

    Return the matching entries in file order, without the leading
    marker and without surrounding whitespace.
    """
    path = Path(config_dir) / 'initial_data' / 'seed_urls.list'
    # Decode explicitly as UTF-8 so that the result does not depend on
    # the locale of the machine (seed URLs may contain non-ASCII text).
    text = path.read_text(encoding='utf-8')
    marker = {'black': '-', 'white': '+'}.get(black_white)
    if marker is None:
        return []
    # A comment line starts with '#' and therefore never with a marker,
    # so no separate comment check is needed.
    return [
        line[1:].strip()
        for line in map(str.strip, text.splitlines())
        if line.startswith(marker)
    ]
|