atextcrawler/src/atextcrawler/site/operations.py

"""
Operations on sites.
"""

import logging
from datetime import datetime, timedelta
from typing import Optional

from asyncpg import Connection

from ..models import Crawl, Site, TextResource
from ..resource import (
    add_site_paths,
    extract_sitemap_paths,
    get_sitemap_urls,
    store_boilerplate_texts,
)
from ..utils.durl import Durl
from ..utils.similarity import get_simhash_index
from .feeds import fetch_feeds, store_new_feeds
from .parse import parse_startpage
from .robots import RobotsInfo

logger = logging.getLogger(__name__)


async def checkout_site(
    config, conn: Connection
) -> tuple[Optional[Site], bool, bool]:
    """
    Get a site to be crawled and mark it with crawl_active=true.

    Also return whether the site shall be fully crawled; if not, only
    the resources from the site's feeds shall be crawled.
    Also return whether more sites might be available.
    """
    async with conn.transaction():
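        # Select one due site without blocking parallel workers:
        # FOR UPDATE SKIP LOCKED skips rows locked by concurrent
        # transactions, and the is_full column tells whether a full
        # crawl (rather than only a feed crawl) is due.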
        sql = (
            "SELECT id, next_full_crawl < now() at time zone 'UTC' is_full"
            " FROM site WHERE crawl_enabled AND crawl_active = false"
            " AND (next_full_crawl < now() at time zone 'UTC'"
            " OR next_feed_crawl < now() at time zone 'UTC')"
            " LIMIT 1 FOR UPDATE SKIP LOCKED"
        )
        row = await conn.fetchrow(sql)
        if row:
            site_id = row['id']
            is_full = row['is_full']
            sql = "UPDATE site SET crawl_active = true WHERE id=$1"
            await conn.execute(sql, site_id)
            site = await Site().load(conn, site_id)
            if site:
                site.base_durl = await Durl(site.base_url)
                if site.base_durl:
                    site.simhash_index = await get_simhash_index(conn, site_id)
                    return site, is_full, True
                else:
                    # site not available; schedule next crawl
                    int_full = config['crawl']['full_crawl_interval']
                    int_feed = config['crawl']['feed_crawl_interval']
                    now = datetime.utcnow()
                    t_full = now + timedelta(seconds=int_full)
                    t_feed = now + timedelta(seconds=int_full + int_feed)
                    sql = (
                        "UPDATE site SET crawl_active=false,"
                        " next_full_crawl=$1, next_feed_crawl=$2"
                        " WHERE id=$3"
                    )
                    await conn.execute(sql, t_full, t_feed, site_id)
                    return None, False, True
            return None, False, True
    return None, False, False


async def update_site(
    app, fetcher, conn: Connection, base_url, site: Optional[Site] = None
) -> tuple[Optional[Site], bool]:
    """
    Try to fetch base_url and return a site and whether a new one was created.

    This function is run for all sites (including blacklisted and irrelevant
    ones). It determines whether the site shall be crawled.

    If an error occurs, return (None, False); if a site was given,
    also set it to crawl_enabled=False and remove its crawling schedules.

    If base_url could be fetched, update the site, possibly creating
    a new one.

    If the site has crawl_enabled and no full crawl is scheduled,
    schedule one (by updating column `next_full_crawl`).
    """
    # fetch startpage
    logger.info(f'Updating site={site}, base_url={base_url}')
    resource = await fetcher.fetch(base_url, site=site)
    if (
        not isinstance(resource, TextResource)
        or resource.content_type != 'html'
    ):
        if site:
            site.meta_info['error'] = 'Invalid start page'
            site.crawl_enabled = False
            site.next_full_crawl = None
            site.next_feed_crawl = None
            await site.save(conn)
        logger.info(f'Failed startpage {base_url}: {resource}')
        return None, False

    # parse startpage (extract site information) and save the site
    site = await parse_startpage(resource, app=app, site=site)
    site_id, created = await site.save(conn)
    if created:
        logger.debug(f'Created {site}')

    # add black-/white-listing info
    is_allowed = await is_site_allowed(conn, site.id_, base_url)
    if is_allowed is not None and is_allowed != site.crawl_enabled:
        site.crawl_enabled = is_allowed
        await site.save(conn)

    # schedule full crawl, if none is scheduled and the site shall be crawled
    if site.crawl_enabled:
        sql = (
            "UPDATE site"
            " SET next_full_crawl=now() at time zone 'UTC'"
            " WHERE id=$1 AND next_full_crawl IS null"
        )
        await conn.execute(sql, site_id)

    return site, created


async def is_site_allowed(
    conn: Connection,
    site_id: Optional[int],
    base_url: str,
) -> Optional[bool]:
    """
    Return True if the site is whitelisted, False if blacklisted, else None.

    Also add missing site_ids to the annotations.
    """
    sql = "SELECT * FROM site_annotation WHERE site_id=$1 OR base_url=$2"
    anns = await conn.fetch(sql, site_id, base_url)
    for ann in anns:
        if ann['ann_type'] == 'blacklist':
            return False
        if ann['ann_type'] == 'whitelist':
            return True

    # add missing site_ids
    if site_id and any([ann['site_id'] is None for ann in anns]):
        sql = "UPDATE site_annotation SET site_id=$1 WHERE base_url=$2"
        await conn.execute(sql, site_id, base_url)

    return None


async def process_site(fetcher, conn: Connection, site: Site):
    """
    Process a site: fetch and store more information.

    Store external and internal links; find boilerplate texts;
    fetch sitemaps; fetch feeds; update date of last publication.
    """
    if not site.id_:  # only to satisfy typing
        return

    if site.links_ext:
        await _store_cross_site_links(conn, site.id_, site.links_ext)
    if site.links_int:
        paths = []
        for durl, (rel, _) in site.links_int.items():
            canon = (rel and rel.lower() == 'canonical') or None
            paths.append((durl.pwa(), canon))
        await add_site_paths(conn, site.id_, paths)

    await store_boilerplate_texts(fetcher, conn, site)

    # get sitemaps and add their resources
    robots = await RobotsInfo(site.base_url)  # type: ignore
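    # robots.site_maps presumably holds the sitemap URLs declared in
    # the site's robots.txt (if any)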
    urls = await get_sitemap_urls(
        fetcher, site.base_url, sitemaps=robots.site_maps
    )
    paths_, latest = extract_sitemap_paths(site.base_url, urls)
    await add_site_paths(conn, site.id_, paths_)

    # store feeds and their resources
    await store_new_feeds(conn, site.id_, site.feeds)
    latest_ = await fetch_feeds(fetcher, conn, site)
    if latest_:
        latest = max(latest or latest_, latest_)

    # update last_pub
    if latest:
        site.last_pub = latest
        await site.save(conn)


async def checkin_site(app, conn: Connection, site: Site, crawl: Crawl):
    """
    Unlock the site and schedule the next crawl.

    *crawl* is the crawl that has just finished (regularly or stopped).

    If the crawl was stopped (t_end is None), just unlock the site.
    Otherwise schedule a crawl of the same type. After a full crawl,
    a feed crawl is also scheduled, unless one is already scheduled.
    """
    if crawl.t_end is None:
        sql = "UPDATE site SET crawl_active=false WHERE id=$1"
        await conn.execute(sql, site.id_)
    elif crawl.is_full:
        full_interval = app.config['crawl']['full_crawl_interval']
        feed_interval = app.config['crawl']['feed_crawl_interval']
        next_full_crawl = crawl.t_begin + timedelta(seconds=full_interval)
        next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval)
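        # coalesce keeps an already scheduled feed crawl and only
        # fills in next_feed_crawl if none is scheduled yet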
        sql = (
            "UPDATE site SET crawl_active=false, next_full_crawl=$1,"
            " next_feed_crawl=coalesce(next_feed_crawl, $2) WHERE id=$3"
        )
        await conn.execute(sql, next_full_crawl, next_feed_crawl, site.id_)
    else:
        feed_interval = app.config['crawl']['feed_crawl_interval']
        next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval)
        sql = (
            "UPDATE site SET crawl_active=false, next_feed_crawl=$1"
            " WHERE id=$2"
        )
        await conn.execute(sql, next_feed_crawl, site.id_)
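

# Usage sketch (hypothetical, not part of this module): a crawler worker
# could pair checkout_site() with checkin_site() roughly like this,
# assuming an asyncpg connection `conn`, the application `app` and a
# Crawl record `crawl` obtained elsewhere:
#
#     while True:
#         site, is_full, more = await checkout_site(app.config, conn)
#         if site is None:
#             if not more:
#                 break  # nothing due; caller may back off and retry
#             continue
#         try:
#             ...  # full or feed crawl of `site`, depending on is_full
#         finally:
#             await checkin_site(app, conn, site, crawl)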


async def _store_cross_site_links(
    conn: Connection,
    site_id: int,
    links: dict[Durl, tuple[list[str], str]],
) -> None:
    """
    Put outgoing links into site_link/site_queue for existing/unknown sites.

    Separate outgoing links from *site_id* into two classes:
    (a) existing sites (rows in table site) and (b) unknown links.

    Add links from class (a) to table site_link.
    Add links from class (b) to table site_queue.
    """
    # add outgoing cross-site links for existing sites to table site_link
    urls = [url.site() for url in links.keys()]
    values = []
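    # && is the PostgreSQL array-overlap operator: select sites whose
    # base_urls array shares at least one element with the outgoing
    # link URLs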
sql = "SELECT id, unnest(base_urls) url FROM site WHERE base_urls && $1"
if rows := await conn.fetch(sql, urls):
for row in rows:
if (durl := await Durl(row['url'])) in links.keys():
_, link_text = links.pop(durl)
if site_id != row['id']:
values.append((site_id, row['id'], link_text))
sql = (
"INSERT INTO site_link (src, dst, link_text)"
" VALUES ($1, $2, $3) ON CONFLICT (src, dst) DO NOTHING"
)
await conn.executemany(sql, values)
# add outgoing cross-site links for unknown sites to table site_queue
sql = "INSERT INTO site_queue (src, url, link_text) VALUES ($1, $2, $3)"
    values = [
        (site_id, durl.site()[:200], link_text[:100])
        for durl, (_, link_text) in links.items()
    ]
    await conn.executemany(sql, values)