atextcrawler/src/atextcrawler/site/queue.py

"""
Queue of sites.
When processing a resource, its external links are put into database table
`site_queue`.
The items in `site_queue` are processed in :func:`process_site_queue`.
This is done baseURL by baseURL (see :func:`iter_site_queue`).
While doing this, cross-site links are put into table `site_link`.
"""

import logging
from typing import AsyncIterator, Optional

import aiohttp
from asyncpg import Connection

from ..resource import ResourceFetcher
from .operations import update_site

logger = logging.getLogger(__name__)


async def process_site_queue(app, pool):
    """
    Loop over queued sites, creating new sites and adding cross-site links.
    """
    site_delay = app.config['crawl']['site_delay']
    resource_delay = app.config['crawl']['resource_delay']
    async with pool.acquire() as conn:
        async with aiohttp.ClientSession() as session:
            fetcher = ResourceFetcher(session)
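            # Outer loop: once the queue is drained, sleep for site_delay and
            # re-check. Inner loop: handle one queued base URL at a time,
            # sleeping resource_delay between sites.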
            while app.running:
                async for base_url, links_from in iter_site_queue(app, conn):
                    # get or create site
                    msg = f'Site queue: updating {base_url}'
                    logger.debug(msg)
                    site, created = await update_site(
                        app, fetcher, conn, base_url
                    )
                    if site:
                        await store_incoming_site_site_links(
                            conn, site.id_, links_from
                        )
                    # delete handled queue items
                    sql = "DELETE FROM site_queue WHERE url=$1"
                    await conn.execute(sql, base_url)
                    await app.sleep(resource_delay)
                logger.debug(
                    f'Queued sites exhausted, sleeping'
                    f' for {site_delay} seconds'
                )
                await app.sleep(site_delay)


async def iter_site_queue(
    app, conn: Connection
) -> AsyncIterator[tuple[str, dict[int, str]]]:
    """
    Yield URLs with aggregated link information from site_queue.

    Yield a URL and a dict mapping ids of linking sites to link texts.
    """
    site_revisit_interval = app.config['crawl']['site_revisit_interval']
    while app.running:
        sql = (
            "SELECT url, array_agg(src) srcs,"
            " array_agg(link_text) link_texts"
            " FROM site_queue GROUP BY url LIMIT 1"
        )
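        # Fetch one aggregated row: all queued links for a single base URL,
        # with srcs[i] pairing with link_texts[i].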
        row = await conn.fetchrow(sql)
        if row:
            base_url = row['url']
            links_from = {}
            srcs = row['srcs']
            link_texts = row['link_texts']
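            # Entries with a falsy src (no known linking site) are skipped.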
            for i in range(len(srcs)):
                if src := srcs[i]:
                    links_from[src] = link_texts[i]
            if site_id := await site_recently_updated(
                conn, base_url, site_revisit_interval
            ):
                # just store incoming links and remove the site from the queue
                await store_incoming_site_site_links(conn, site_id, links_from)
                sql = "DELETE FROM site_queue WHERE url=$1"
                await conn.execute(sql, base_url)
            else:
                yield base_url, links_from
        else:
            break


async def site_recently_updated(
    conn: Connection,
    base_url: str,
    site_revisit_interval: float,
) -> Optional[int]:
    """
    Return the id of the site with given base_url if it was updated recently.
    """
    sql = (
        f"SELECT id FROM site WHERE $1=any(base_urls)"
        f" AND last_update + interval '{site_revisit_interval} seconds'"
        f" > now() at time zone 'utc' LIMIT 1"
    )
    site_id = await conn.fetchval(sql, base_url)
    return site_id


async def store_incoming_site_site_links(
    conn: Connection, site_id: int, links_from: dict
):
    """
    Store incoming site-site links (irrespective of crawl_enabled).

    *site_id* is the id of the site to which the links in *links_from* point.
    """
    sql = (
        "INSERT INTO site_link"
        " (src, dst, link_text) VALUES ($1, $2, $3)"
        " ON CONFLICT (src, dst) DO NOTHING"
    )
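    # Skip self-links; ON CONFLICT makes re-inserting an existing link a no-op.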
    values = [
        (from_id, site_id, link_text)
        for from_id, link_text in links_from.items()
        if from_id != site_id
    ]
    await conn.executemany(sql, values)