atextcrawler/src/atextcrawler/site/queue.py

"""
Queue of sites.
When processing a resource, its external links are put into database table
`site_queue`.
The items in `site_queue` are processed in :func:`process_site_queue`.
This is done baseURL by baseURL (see :func:`iter_site_queue`).
While doing this, cross-site links are put into table `site_link`.
"""

import logging
from typing import AsyncIterator, Optional

import aiohttp
from asyncpg import Connection

from ..resource import ResourceFetcher
from .operations import update_site

logger = logging.getLogger(__name__)


async def process_site_queue(app, pool):
    """
    Loop over queued sites, creating new sites and adding cross-site links.
    """
    site_delay = app.config['crawl']['site_delay']
    resource_delay = app.config['crawl']['resource_delay']
    async with pool.acquire() as conn:
        async with aiohttp.ClientSession() as session:
            fetcher = ResourceFetcher(session)
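            # Outer loop: once the queue is drained, sleep for site_delay and
            # re-check. Inner loop: handle one queued base URL at a time,
            # sleeping resource_delay between sites.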
            while app.running:
                async for base_url, links_from in iter_site_queue(app, conn):
                    # get or create site
                    msg = f'Site queue: updating {base_url}'
                    logger.debug(msg)
                    site, created = await update_site(
                        app, fetcher, conn, base_url
                    )
                    if site:
                        await store_incoming_site_site_links(
                            conn, site.id_, links_from
                        )
                    # delete handled queue items
                    sql = "DELETE FROM site_queue WHERE url=$1"
                    await conn.execute(sql, base_url)
                    await app.sleep(resource_delay)
                logger.debug(
                    f'Queued sites exhausted, sleeping'
                    f' for {site_delay} seconds'
                )
                await app.sleep(site_delay)


async def iter_site_queue(
    app, conn: Connection
) -> AsyncIterator[tuple[str, dict[int, str]]]:
    """
    Yield URLs with aggregated link information from site_queue.

    Yield a URL and a dict mapping ids of linking sites to link texts.
    """
    site_revisit_interval = app.config['crawl']['site_revisit_interval']
    while app.running:
        sql = (
            "SELECT url, array_agg(src) srcs,"
            " array_agg(link_text) link_texts"
            " FROM site_queue GROUP BY url LIMIT 1"
        )
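        # Fetch one aggregated row: all queued links for a single base URL,
        # with srcs[i] pairing with link_texts[i].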
        row = await conn.fetchrow(sql)
        if row:
            base_url = row['url']
            links_from = {}
            srcs = row['srcs']
            link_texts = row['link_texts']
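            # Entries with a falsy src (no known linking site) are skipped.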
            for i in range(len(srcs)):
                if src := srcs[i]:
                    links_from[src] = link_texts[i]
            if site_id := await site_recently_updated(
                conn, base_url, site_revisit_interval
            ):
                # just store incoming links and remove the site from the queue
                await store_incoming_site_site_links(conn, site_id, links_from)
                sql = "DELETE FROM site_queue WHERE url=$1"
                await conn.execute(sql, base_url)
            else:
                yield base_url, links_from
        else:
            break


async def site_recently_updated(
    conn: Connection,
    base_url: str,
    site_revisit_interval: float,
) -> Optional[int]:
    """
    Return the id of the site with given base_url if it was updated recently.
    """
    sql = (
        f"SELECT id FROM site WHERE $1=any(base_urls)"
        f" AND last_update + interval '{site_revisit_interval} seconds'"
        f" > now() at time zone 'utc' LIMIT 1"
    )
    site_id = await conn.fetchval(sql, base_url)
    return site_id


async def store_incoming_site_site_links(
    conn: Connection, site_id: int, links_from: dict
):
    """
    Store incoming site-site links (irrespective of crawl_enabled).

    *site_id* is the id of the site to which the links in *links_from* point.
    """
    sql = (
        "INSERT INTO site_link"
        " (src, dst, link_text) VALUES ($1, $2, $3)"
        " ON CONFLICT (src, dst) DO NOTHING"
    )
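    # Skip self-links; ON CONFLICT makes re-inserting an existing link a no-op.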
    values = [
        (from_id, site_id, link_text)
        for from_id, link_text in links_from.items()
        if from_id != site_id
    ]
    await conn.executemany(sql, values)