# atextcrawler/src/atextcrawler/resource/operations.py
"""
Operations on resources.
"""
import logging
from datetime import datetime
from typing import Optional, Sequence

from asyncpg import Connection

from ..models import (
    Feed,
    MetaResource,
    ResourceError,
    Site,
    Sitemap,
    SitemapIndex,
    SitePath,
    TextResource,
)
from ..search import delete_resource, index_resource
from ..tensorflow import TensorFlow
from ..utils.durl import Durl
from ..utils.similarity import (
    create_simhash,
    search_simhash,
    simhash_from_bigint,
    simhash_to_bigint,
)
from .feed import convert_feed_entries
from .fetch import ResourceFetcher
from .sitemap import extract_sitemap_paths

logger = logging.getLogger(__name__)


async def add_site_paths(
    conn: Connection,
    site_id: int,
    paths: Sequence[tuple[str, Optional[bool]]],
) -> None:
    """
    Add site paths.

    The paths must be given as relative paths, each together with a boolean
    telling whether the link is a canonical link (or None if unknown).
    """
    sql = (
        "INSERT INTO site_path (site_id, path, canonical)"
        " VALUES ($1, $2, $3) ON CONFLICT (site_id, path) DO NOTHING"
    )
    # cap the number of paths per call and skip overlong paths
    values = (
        (site_id, path, canonical)
        for path, canonical in paths[:100000]
        if len(path) <= 400
    )
    await conn.executemany(sql, values)
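
# Illustrative call of add_site_paths() (hypothetical values):
#
#     await add_site_paths(conn, site_id, [
#         ('/blog/first-post', True),   # linked via rel=canonical
#         ('/blog/?p=1', None),         # canonicity unknown
#     ])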


async def update_resource_meta(
    conn: Connection,
    site_id: int,
    resource_meta: dict,
) -> None:
    """
    Update meta information of existing resources, using their paths
    to find them.
    """
    sql = (
        "UPDATE resource SET last_change=coalesce($1, last_change),"
        " title=coalesce($2, title), summary=coalesce($3, summary) FROM ("
        " SELECT resource_id FROM site_path WHERE site_id=$4 AND path=$5"
        ") sp WHERE resource.id=sp.resource_id"
    )
    values = ((*meta, site_id, path) for path, meta in resource_meta.items())
    await conn.executemany(sql, values)
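
# Shape of *resource_meta*, as implied by the tuple unpacking above
# (illustrative values, not a stored schema):
#
#     resource_meta = {
#         '/blog/first-post': (last_change, title, summary),
#     }
#
# None values leave the corresponding columns unchanged via coalesce().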


async def store_feed_entries(
    conn: Connection,
    site: Site,
    entries: list[dict],
) -> None:
    """
    Add missing resources of a site from given feed entries.
    """
    if site.id_:
        paths, resource_meta = convert_feed_entries(site.base_url, entries)
        await add_site_paths(conn, site.id_, paths)
        await update_resource_meta(conn, site.id_, resource_meta)


async def get_site_path(
    conn: Connection,
    site: Site,
    before: datetime,
    only_new=False,
) -> Optional[SitePath]:
    """
    Return the next path of a given site that needs to be processed.

    If none needs to be processed, return None.
    In particular, return None for sites having crawl_enabled=false.
    Only return paths that have last been visited before *before*
    or have not been processed at all. Paths with an ok_count of -3
    or lower are skipped.
    If *only_new*, limit to paths that have not been processed at all,
    irrespective of the value of *before*.
    """
    sql = "SELECT crawl_enabled FROM site WHERE id=$1"
    crawl_enabled = await conn.fetchval(sql, site.id_)
    if not crawl_enabled:
        return None
    if only_new:
        sql = (
            "SELECT * FROM site_path"
            " WHERE site_id=$1 AND last_visit is null LIMIT 1"
        )  # implicitly canonical=null
        row = await conn.fetchrow(sql, site.id_)
    else:
        sql = (
            "SELECT * FROM site_path"
            " WHERE site_id=$1 AND canonical IS NOT false AND"
            " (last_visit is null OR last_visit<$2) AND"
            " ok_count > -3 LIMIT 1"
        )  # canonical can be true or null
        row = await conn.fetchrow(sql, site.id_, before)
    if row:
        return await SitePath().load_from_row(row)
    return None
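
# Minimal polling sketch for get_site_path() (an assumed caller pattern,
# not part of this module); processing must update last_visit, otherwise
# the same path is returned again:
#
#     before = datetime.utcnow() - timedelta(days=1)
#     while site_path := await get_site_path(conn, site, before):
#         ...  # process site_path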


async def process_site_path(
    app,
    worker_number: int,
    conn: Connection,
    fetcher: ResourceFetcher,
    tf: TensorFlow,
    site: Site,
    site_path: SitePath,
) -> bool:
    """
    Fetch a path, deduplicate it and, if canonical, update and index the
    resource.

    Return whether a new resource was handled and should contribute to
    statistics.
    """
    msg = (
        f'Worker {worker_number} processing site {site.id_}'
        f' site_path {site_path.id_} {site.base_url}{site_path.path}'
    )
    logger.debug(msg)
    if not site.id_:  # only to satisfy typing
        return False
    # fetch the url
    site_path.last_visit = datetime.utcnow()
    url = site_path.url(site)
    resource = await fetcher.fetch(url, site=site)
    # handle failure (possibly deleting old information)
    if not isinstance(resource, (TextResource, MetaResource)):
        if not resource:  # irrelevant content-type
            site_path.ok_count = -10
        elif isinstance(resource, ResourceError):
            site_path.ok_count -= 1
        if site_path.ok_count <= -3 and site_path.resource_id:
            await site_path.unlink_resource(
                conn,
                app.search_engine,
                app.config['elasticsearch']['index_base_name'],
            )
        await site_path.save(conn)
        if resource:  # relevant content-type
            msg = (
                f'Worker {worker_number} failed to process site_path'
                f' {site_path.id_} (site {site.id_},'
                f' {site.base_url}{site_path.path})'
            )
            logger.info(msg)
        return False
    # handle MetaResources
    if isinstance(resource, MetaResource):
        if isinstance(resource, Feed):
            resource.site_id = site.id_
            await resource.save(conn)
            if resource.entries:
                await store_feed_entries(conn, site, resource.entries)
        elif isinstance(resource, Sitemap):
            paths, _ = extract_sitemap_paths(site.base_url, resource.urls)
            await add_site_paths(conn, site.id_, paths)
        elif isinstance(resource, SitemapIndex):
            for sitemap_dict in resource.sitemaps:
                url = sitemap_dict['loc']
                res_sitemap = await fetcher.fetch(url, site=site)
                if isinstance(res_sitemap, Sitemap):
                    paths, _ = extract_sitemap_paths(
                        site.base_url, res_sitemap.urls
                    )
                    await add_site_paths(conn, site.id_, paths)
        resource_id = None
        is_new_resource = False
    else:  # handle TextResources
        resource_id, is_new_resource = await _handle_text_resource(
            app, conn, tf, site, site_path, resource, url
        )
        site_path.canonical = resource.init_fields.get('canonical')
        if shortlink_url := resource.init_fields.get('shortlink'):
            await _save_shortlink(
                conn,
                site,
                url,
                resource_id,
                shortlink_url,
                site_path.last_visit,
            )
    site_path.resource_id = resource_id
    site_path.ok_count += 1
    await site_path.save(conn)
    return is_new_resource
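
# A crawl worker might combine get_site_path() and process_site_path()
# like this (a sketch under assumed scheduling, not the crawler's actual
# loop; crawl_stats is a hypothetical counter):
#
#     while site_path := await get_site_path(conn, site, before):
#         if await process_site_path(
#             app, worker_number, conn, fetcher, tf, site, site_path
#         ):
#             crawl_stats['new_resources'] += 1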


async def _handle_text_resource(
    app, conn, tf, site, site_path, resource, url
) -> tuple[Optional[int], bool]:
    """
    Ingest a text resource, merging it with similar existing resources.

    Return the id of the (possibly merged) resource (or None if the
    incoming resource's text is too short to be worth storing) and
    whether the resource is new (False if the returned resource_id
    is None).
    """
    # save the resource's internal links
    paths = []
    if links_int := resource.init_fields['links_int']:
        for durl, (rel, _) in links_int.items():
            rp_filter = app.plugins['filter_resource_path'].rp_filter
            if path := rp_filter(site, durl):
                canon = (rel and rel.lower() == 'canonical') or None
                paths.append((path, canon))
    await add_site_paths(conn, site.id_, paths)
    # discard resources with too short texts
    text = resource.search_fields['text']
    if len(text) < app.config['crawl']['min_text_length']:
        await site_path.unlink_resource(
            conn,
            app.search_engine,
            app.config['elasticsearch']['index_base_name'],
        )
        await site_path.save(conn)
        return None, False
    # find resources similar to the current text
    simhash = simhash_from_bigint(resource.simhash)
    index = site.simhash_index
    similar_ids = search_simhash(index, simhash)
    logger.debug(f'Similar ids: {similar_ids}, old: {site_path.resource_id}')
    # determine the destination resource and the resources to merge into it
    old_id = site_path.resource_id
    if old_id and old_id in similar_ids:
        merge_ids = similar_ids
        dest_resource = await TextResource().load(conn, old_id)
    else:  # no old text, or old text not similar any more
        if old_id:
            await site_path.unlink_resource(
                conn,
                app.search_engine,
                app.config['elasticsearch']['index_base_name'],
            )
        # find the first existing similar resource
        for similar_id in similar_ids:
            dest_resource = await TextResource().load(conn, similar_id)
            if dest_resource:
                # also require similar length
                l1 = len(resource.search_fields['text'])
                l2 = dest_resource.text_len
                if 0.95 * l2 <= l1 <= 1.05 * l2:
                    merge_ids = list(
                        filter(lambda elem: elem != similar_id, similar_ids)
                    )
                    break
        else:
            dest_resource = None
            merge_ids = []
    # update or create the destination resource
    if dest_resource:
        is_new_resource = False
        resource.simhash = create_simhash(index, dest_resource.id_, simhash)
        await dest_resource.update_from_resource(resource)
        resource = dest_resource
    else:
        is_new_resource = True
        resource.simhash = simhash_to_bigint(simhash)
        await resource.save(conn)
        create_simhash(index, resource.id_, simhash)
    # add the resource to the search index
    await index_resource(
        app.search_engine,
        tf,
        site_path,
        resource,
        site.base_url,
        url,
    )
    # point references to any merged resource at the destination resource
    sql = "UPDATE site_path SET resource_id=$1 WHERE resource_id=ANY($2)"
    await conn.execute(sql, resource.id_, merge_ids)
    # remove orphaned resources after merging
    sql = "DELETE FROM resource WHERE id=ANY($1) RETURNING id, lang"
    rows = await conn.fetch(sql, list(set(merge_ids) - {resource.id_}))
    for row in rows:
        await delete_resource(
            app.search_engine,
            row['lang'],
            row['id'],
        )
    return resource.id_, is_new_resource
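
# The deduplication above relies on simhashes: near-duplicate texts map to
# hashes that differ in only a few bits, so search_simhash() can return
# candidate resource ids cheaply. A self-contained sketch using the
# `simhash` package (assumed here to be the backend wrapped by
# utils.similarity):
#
#     from simhash import Simhash, SimhashIndex
#     index = SimhashIndex([], k=3)  # k = max number of differing bits
#     index.add('1', Simhash('a reasonably long example text'))
#     dups = index.get_near_dups(Simhash('a reasonably long example text!'))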


async def _save_shortlink(
    conn, site, url, resource_id, shortlink_url, last_visit
):
    """
    Save a shortlink.
    """
    shortlink_durl = await Durl(shortlink_url, base=site.base_url)
    if shortlink_durl and shortlink_url != url:
        sql = "SELECT * FROM site_path WHERE site_id=$1 AND path=$2"
        sl_path = shortlink_durl.pwa()
        row = await conn.fetchrow(sql, site.id_, sl_path)
        shortlink = await SitePath().load_from_row(row)
        if not shortlink:
            shortlink = SitePath(
                site_id=site.id_,
                path=sl_path,
                last_visit=last_visit,
                ok_count=1,
                canonical=False,
                resource_id=resource_id,
            )
        else:
            shortlink.last_visit = last_visit
            shortlink.ok_count += 1
            shortlink.canonical = False
            shortlink.resource_id = resource_id
        await shortlink.save(conn)