atextcrawler/src/atextcrawler/models.py


"""
Data Models.
"""
import logging
from dataclasses import InitVar, asdict, dataclass, field, fields
from datetime import date, datetime
from itertools import chain
from typing import Any, ClassVar, Optional
import tldextract
from asyncpg import Connection
from .search import delete_resource
from .utils.durl import Durl, get_url_variants
from .utils.link import extract_domain
from .utils.similarity import get_simhash, simhash_to_bigint
logger = logging.getLogger(__name__)
class ModelBase:
"""
Abstract base class for models.
Execute SQL to load, save, delete instances using asyncpg.
"""
table: ClassVar
id_: Optional[int] = 0
async def load(self, conn: Connection, id_: int) -> Optional[Any]:
"""
If loading fails, return None.
"""
sql = f"SELECT * FROM {self.table} WHERE id=$1"
row = await conn.fetchrow(sql, id_)
if not row:
return None
return await self.load_from_row(row)
async def load_from_row(self, row):
"""
If row is None, return None.
"""
if not row:
return None
data = dict(row)
self.id_ = data.pop('id')
self.__init__(**data)
return self
async def save(self, conn: Connection) -> None:
"""
Save the instance (update if self.id_ is set, else insert).
"""
data = asdict(self)
# logger.debug(f'Save {self}: id_={self.id_}')
if self.id_: # update
cols = ', '.join(data.keys())
upds = ', '.join(
[f'{col}=${i + 1}' for i, col in enumerate(data.keys())]
)
val_id = f'${len(data) + 1}'
sql = f"UPDATE {self.table} SET {upds} WHERE id={val_id}"
await conn.execute(sql, *data.values(), self.id_)
else: # insert
cols = ', '.join(data.keys())
vals = ', '.join([f'${i + 1}' for i in range(len(data))])
sql = (
f"INSERT INTO {self.table} ({cols}) VALUES ({vals})"
f" RETURNING id"
)
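            # Resulting statement for e.g. a Crawl instance (column names are
            # illustrative): INSERT INTO crawl (site_id, is_full, ...)
            # VALUES ($1, $2, ...) RETURNING id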
self.id_ = await conn.fetchval(sql, *data.values())
def asdict(self):
"""
Return instance data as dictionary.
"""
return asdict(self)
async def delete(self, conn: Connection) -> None:
"""
Delete the object if it has an id_.
"""
if self.id_:
sql = f"DELETE FROM {self.table} WHERE id=$1"
await conn.execute(sql, self.id_)
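# A minimal usage sketch for ModelBase subclasses (assuming an asyncpg
# connection `conn`; Crawl is defined further below):
#
#     crawl = Crawl(site_id=1)
#     await crawl.save(conn)    # INSERT ... RETURNING id, sets crawl.id_
#     crawl.n_resources += 1
#     await crawl.save(conn)    # UPDATE, since id_ is now set
#     await crawl.delete(conn)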
class ResourceError:
"""
    Error encountered while trying to fetch a resource.
"""
def __init__(self, msg, status=None, headers=None):
self.msg = msg
self.status = status
self.headers = headers
def __repr__(self):
return f'ResourceError: {self.msg}'
class ResourceRedirect:
"""
A resource containing a redirect.
"""
def __init__(self, urls):
self.urls = urls
@dataclass
class TextResource(ModelBase):
"""
TextResource (without path).
TextResource models web resources with relevant text content.
    They are instantiated in the modules page, document, ...; their metadata
    is stored in table `resource` and the text content is stored with the
    search engine.
    Do not confuse with SitePath: several SitePath instances
    may point to one TextResource. The TextResource holds the actual content.
    Unless we are dealing with the startpage of a new site,
    the init_fields dict usually contains the site to which
    the resource belongs.
"""
table: ClassVar = 'resource'
init_fields: InitVar[dict] = None # additional fields after fetching
search_fields: InitVar[dict] = None # additional fields for indexing
# database fields
simhash: Optional[int] = None
content_type: Optional[str] = None
last_change: Optional[datetime] = None
text_len: int = 0
lang: Optional[str] = None
title: Optional[str] = None
summary: Optional[str] = None
def __post_init__(self, init_fields, search_fields):
if init_fields is None:
init_fields = {}
self.init_fields = init_fields
if search_fields is None:
search_fields = {}
self.search_fields = search_fields
self.site = self.init_fields.get('site')
self.site_id = self.site.id_ if self.site else None
self._update_simhash()
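    # Typical construction (a sketch; field values and init_fields keys other
    # than 'site' are illustrative):
    #
    #     resource = TextResource(
    #         init_fields={'site': site, 'headers': headers},
    #         search_fields={'text': text},
    #         content_type='html',
    #         lang='en',
    #         title=title,
    #     )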
def __str__(self):
return (
f'TextResource(id={self.id_},'
f' site_id={self.site_id},'
f' type={self.content_type})'
)
def _update_simhash(self):
"""
Update the simhash of the resource from its text content.
"""
if self.simhash is None:
text = self.search_fields.get('text', '')
self.simhash = simhash_to_bigint(get_simhash(text))
async def save(self, conn: Connection):
"""
Save the instance, extending the parent's method.
"""
self.content_type = (
self.content_type[:50] if self.content_type else None
)
self.title = self.title[:200] if self.title else None
self.summary = self.summary[:400] if self.summary else None
self._update_simhash()
if self.last_change is None:
self.last_change = datetime.utcnow()
await super().save(conn)
async def update_from_resource(self, upd: 'TextResource'):
"""
Update self with values from another resource.
"""
names = [field.name for field in fields(self)]
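        # Fill in only those fields that are currently empty on self;
        # existing values are never overwritten.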
for name in names:
cur_val = getattr(self, name)
upd_val = getattr(upd, name)
if not cur_val and upd_val is not None:
setattr(self, name, upd_val)
init_names = [
'headers',
'redirects',
'links_int',
'links_ext',
'shortlinks',
'canonical',
#'head',
]
self.init_fields = upd.init_fields
self.search_fields = upd.search_fields
# for init_name in init_names:
# cur_val = self.init_fields.get(init_name)
# upd_val = upd.init_fields.get(init_name)
# if not cur_val and upd_val is not None:
# self.init_fields[init_name] = upd_val
@dataclass
class MetaResource(ModelBase):
"""
Parent class for Feed, Sitemap, SitemapIndex.
    Their instances are not stored. Note: class Feed contains feed metadata
    and is stored in the database.
"""
@dataclass
class SitemapIndex(MetaResource):
"""
A SitemapIndex meta resource.
    Just a list of the sitemap URLs, nothing more.
"""
sitemaps: list = field(default_factory=list)
@dataclass
class Sitemap(MetaResource):
"""
A Sitemap meta resource.
Just a list of the resulting links, nothing more.
"""
urls: list = field(default_factory=list)
@dataclass
class Feed(MetaResource):
"""
    A site's feed (RSS, Atom, ...).
"""
table: ClassVar = 'site_feed'
entries: InitVar[list] = None
site_id: Optional[int] = None
url: Optional[str] = None
etag: Optional[str] = None
modified: Optional[str] = None
t_visit: Optional[datetime] = None
t_content: Optional[datetime] = None
version: Optional[str] = None
title: Optional[str] = None
description: Optional[str] = None
fail_count: int = 0
def __post_init__(self, entries):
self.entries = entries
def __str__(self):
return f'Feed(id={self.id_}, site_id={self.site_id}, url={self.url})'
async def save(self, conn: Connection):
"""
Save, trying to merge with existing entry matching on site_id and url.
"""
if not self.site_id or not self.url:
            msg = 'Saving feed failed: missing site_id or url'
logger.error(msg)
return
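        # If a feed row for this site_id and url already exists, adopt its id
        # so that the parent save() performs an UPDATE instead of an INSERT.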
sql = "SELECT id FROM site_feed WHERE site_id=$1 AND url=$2"
self.id_ = await conn.fetchval(sql, self.site_id, self.url)
await super().save(conn)
def debug(self) -> str:
"""
        Return the instance data as a string for debug print output.
"""
return (
f'Feed:\n'
f'- id: {self.id_}\n'
f'- site_id: {self.site_id}\n'
f'- url: {self.url}\n'
f'- etag: {self.etag}\n'
f'- modified: {self.modified}\n'
f'- t_visit: {self.t_visit}\n'
f'- t_content: {self.t_content}\n'
f'- version: {self.version}\n'
f'- title: {self.title}\n'
f'- description: {self.description}\n'
f'- fail_count: {self.fail_count}\n'
f'- entries: {self.entries}'
)
@dataclass
class Site(ModelBase):
"""
Website.
"""
table: ClassVar = 'site'
base_durl: InitVar[Durl] = None
feeds: InitVar[dict] = None
links_ext: InitVar[dict] = None
links_int: InitVar[dict] = None
startpage_text: InitVar[str] = None
canonical_url: Optional[str] = None
base_url: Optional[str] = None
base_urls: list[str] = field(default_factory=list)
domains: list[str] = field(default_factory=list)
ips: Optional[list[str]] = None
crawl_enabled: bool = False
crawl_active: bool = False
next_full_crawl: Optional[datetime] = None
next_feed_crawl: Optional[datetime] = None
last_update: Optional[datetime] = None
last_pub: Optional[datetime] = None
pub_dates: Optional[dict[str, str]] = None
langs: list[str] = field(default_factory=list)
alt_langs: dict[str, str] = field(default_factory=dict)
title: Optional[str] = None
description: Optional[str] = None
keywords: list[str] = field(default_factory=list)
linkbacks: dict[str, str] = field(default_factory=dict)
meta_info: dict = field(default_factory=dict)
boilerplate_texts: list[str] = field(default_factory=list)
def __post_init__(
self,
base_durl: Durl,
feeds=None,
links_ext=None,
links_int=None,
startpage_text=None,
):
self.feeds = feeds
self.links_ext = links_ext
self.links_int = links_int
self.startpage_text = startpage_text
self.keywords = self.keywords[:20]
if not self.last_update:
self.last_update = datetime.utcnow()
pub_date: Optional[str]
if self.last_pub:
pub_date = date.isoformat(self.last_pub.date())
self.pub_dates = {date.isoformat(self.last_update): pub_date}
else:
pub_date = None
self.pub_dates = {}
if base_durl:
self.base_urls = [base_durl.url()[:200]]
self.domains = [extract_domain(base_durl.hostname)[:100]]
def __str__(self):
return (
f'Site(id={self.id_}, url={self.base_url},'
f' crawl_enabled={self.crawl_enabled})'
)
async def update_base_url(self) -> None:
"""
Update the base_url, choosing the most relevant URL.
If canonical_url is not None, use this.
Otherwise set self.base_url to the shortest from self.base_urls,
but requiring a https-url if there is at least one.
"""
if self.canonical_url and self.canonical_url not in self.base_urls:
if canonical_durl := await Durl(self.canonical_url):
self.base_urls.append(self.canonical_url)
domain = extract_domain(canonical_durl.hostname)
if domain not in self.domains:
self.domains.append(domain)
if self.canonical_url:
self.base_url = self.canonical_url
return
if not self.base_url:
url_candidates = self.base_urls
if https_urls := [
url for url in self.base_urls if url.startswith('https://')
]:
url_candidates = https_urls
self.base_url = min(url_candidates, key=len)
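    # Illustration (hypothetical values): with canonical_url=None and
    # base_urls=['http://example.org/', 'https://example.org/'],
    # update_base_url() keeps only the https URL as a candidate and picks
    # 'https://example.org/' as base_url.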
async def save( # type: ignore
self, conn, merge=True
) -> tuple[Optional[int], bool]:
"""
Store the site, optionally trying to merge it with an existing site.
Return the id of the saved instance and whether a new instance
was created.
If self.id_ is not 0, replace the data of the existing site with
this id. Else if not merge, store as new row, and if merge,
try to merge with an existing matching site.
"""
await self.update_base_url()
if not merge:
created = not bool(self.id_)
await super().save(conn)
return self.id_, created
if self.id_:
sql = "SELECT base_urls, pub_dates FROM site WHERE id=$1"
row = await conn.fetchrow(sql, self.id_)
self.base_urls = list(
set(row['base_urls']).union(set(self.base_urls))
)
if previous_pub_dates := row['pub_dates']:
if not self.pub_dates:
self.pub_dates = {}
self.pub_dates.update(previous_pub_dates)
await super().save(conn)
return self.id_, False
same_site_id = await search_same_site(self, conn)
if same_site_id:
same_site = await Site().load(conn, same_site_id)
if same_site_id and same_site:
same_site.base_urls = set(same_site.base_urls).union(
set(self.base_urls)
)
same_site.domains = set(same_site.domains).union(set(self.domains))
if self.canonical_url and not same_site.canonical_url:
same_site.canonical_url = self.canonical_url
await same_site.save(conn, merge=False) # call ourselves
self.id_ = same_site.id_
return self.id_, False
else:
await super().save(conn)
return self.id_, True
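# Sketch of the intended call pattern for Site.save (values are illustrative):
#
#     site = Site(base_durl=durl, title=title, description=description)
#     site_id, created = await site.save(conn)          # merge with an existing site if possible
#     site_id, _ = await site.save(conn, merge=False)    # force an independent row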
@dataclass
class SitePath(ModelBase):
"""
Path of a website. May point to a Resource.
"""
table: ClassVar = 'site_path'
site: InitVar[str] = None
site_id: Optional[int] = None
path: Optional[str] = None
filtered: bool = False
last_visit: Optional[datetime] = None
ok_count: int = 0
canonical: Optional[bool] = None
resource_id: Optional[int] = None
def __str__(self):
return (
f'SitePath(id={self.id_}, site_id={self.site_id},'
f' path={self.path})'
)
async def save(self, conn: Connection):
"""
Save the instance, extending the parent's method.
"""
self.path = self.path[:400] if self.path else ''
await super().save(conn)
async def unlink_resource(self, conn, engine, index_base_name):
"""
Unlink the resource and also delete it, if it has no more links.
"""
if self.id_:
if self.resource_id:
sql = "SELECT COUNT(*) FROM site_path WHERE resource_id=$1"
ref_count = await conn.fetchval(sql, self.resource_id)
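                # Delete the resource itself only when no site_path row
                # references it any more; the RETURNING clause also yields its
                # lang, which is needed to remove it from the search index.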
if ref_count == 0:
sql = (
"DELETE FROM resource WHERE id=$1"
" RETURNING (true, lang)"
)
found = await conn.fetchval(sql, self.resource_id)
if found:
await delete_resource(
engine, found[1], self.resource_id
)
self.resource_id = None
def url(self, site):
"""
Return the full URL (combine the site's base_url with our path).
"""
return site.base_url + self.path
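# Example: for a site with base_url 'https://example.org' and a SitePath with
# path '/blog/first-post', url(site) returns
# 'https://example.org/blog/first-post'.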
@dataclass
class Crawl(ModelBase):
"""
The crawl process of a website (begin, end, statistics, ...).
"""
table: ClassVar = 'crawl'
site_id: Optional[int] = None
is_full: bool = False
    t_begin: datetime = field(default_factory=datetime.utcnow)
t_end: Optional[datetime] = None
n_resources: int = 0
n_resources_new: int = 0
async def finish(self, conn, set_t_end):
"""
Save the crawl. Set t_end only if indicated.
"""
if set_t_end:
self.t_end = datetime.utcnow()
await self.save(conn)
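# Sketch of a crawl lifecycle (a minimal illustration, assuming an asyncpg
# connection `conn` and a saved Site instance `site`):
#
#     crawl = Crawl(site_id=site.id_, is_full=True)
#     await crawl.save(conn)            # records the crawl with its start time
#     ...                               # fetch and process the site's resources
#     crawl.n_resources += 1
#     await crawl.finish(conn, True)    # sets t_end and saves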
async def search_same_site(
site: Site,
conn: Connection,
) -> Optional[int]:
"""
Try to find a matching site for the given *site* and return its id.
TODO: if the path is non-trivial, require it also for the matching site
Two sites match when they return the same content for identical paths.
The base_url (scheme and/or netloc) may differ.
    We do not have the content for all paths of both websites, so we need
    to estimate: we only take into account meta information from the
    start pages of both sites, in particular the title, the description
    and information obtained from the base_urls.
We use a combination of these conditions:
1. one of the sites has a canonical URL which matches the
URL of the other site
2. the content fields (title, description) have sufficient information
3. the content fields match exactly
4. the domain matches
5. the domain matches, except for the TLD
6. the base_urls differ in their schemes (http vs. https)
7. the hostnames in the base_urls are identical
8. the hostnames in the base_urls differ by a prepended 'www.'
9. the IPs have at least one common address
The algorithm is this (first answer is final, yes means match):
* if (1) : yes
* if (2), (3), (4) : yes
* if (2), (3), (5), (9) : yes
* if (6), ((7) or (8)) : yes
* no
"""
# rule (1)
if site.canonical_url:
sql = "SELECT id FROM site WHERE $1=ANY(base_urls) LIMIT 1"
id_ = await conn.fetchval(sql, site.canonical_url)
if id_:
return id_
else:
sql = "SELECT id FROM site WHERE canonical_url=ANY($1) LIMIT 1"
id_ = await conn.fetchval(sql, site.base_urls)
if id_:
return id_
# rule (6), ((7) or (8))
url_variants = set(
chain.from_iterable(
get_url_variants(base_url) for base_url in site.base_urls
)
)
sql = f"SELECT id FROM site WHERE base_urls && $1 LIMIT 1"
if id_ := await conn.fetchval(sql, url_variants):
return id_
# condition (2)
if len(site.title or '') > 15 or len(site.description or '') > 15:
        sql = (
            "SELECT * FROM site WHERE"
            " COALESCE(title, '')=$1 AND COALESCE(description, '')=$2"
        )
rows = await conn.fetch(sql, site.title or '', site.description or '')
# condition (3)
if rows:
# condition (4)
for row in rows:
domains = set(row.get('domains', []))
if domains & set(site.domains):
return row['id']
# condition (9)
for row in rows:
ips = set(row.get('ips', []))
if site.ips and ips & set(site.ips):
# condition (5)
domains_ = row.get('domains', [])
d1 = set([tldextract.extract(d).domain for d in domains_])
domains_ = site.domains or []
d2 = set([tldextract.extract(d).domain for d in domains_])
if d1 & d2:
return row['id']
return None