atextcrawler/src/atextcrawler/models.py


"""
Data Models.
"""
import logging
from dataclasses import InitVar, asdict, dataclass, field, fields
from datetime import date, datetime
from itertools import chain
from typing import Any, ClassVar, Optional
import tldextract
from asyncpg import Connection
from .search import delete_resource
from .utils.durl import Durl, get_url_variants
from .utils.link import extract_domain
from .utils.similarity import get_simhash, simhash_to_bigint
logger = logging.getLogger(__name__)
class ModelBase:
"""
Abstract base class for models.
Execute SQL to load, save, delete instances using asyncpg.
"""
table: ClassVar
id_: Optional[int] = 0
async def load(self, conn: Connection, id_: int) -> Optional[Any]:
"""
If loading fails, return None.
"""
sql = f"SELECT * FROM {self.table} WHERE id=$1"
row = await conn.fetchrow(sql, id_)
if not row:
return None
return await self.load_from_row(row)
async def load_from_row(self, row):
"""
If row is None, return None.
"""
if not row:
return None
data = dict(row)
self.id_ = data.pop('id')
self.__init__(**data)
return self
async def save(self, conn: Connection) -> None:
"""
Save the instance (update if self.id_ is set, else insert).
"""
data = asdict(self)
# logger.debug(f'Save {self}: id_={self.id_}')
if self.id_: # update
cols = ', '.join(data.keys())
upds = ', '.join(
[f'{col}=${i + 1}' for i, col in enumerate(data.keys())]
)
val_id = f'${len(data) + 1}'
sql = f"UPDATE {self.table} SET {upds} WHERE id={val_id}"
await conn.execute(sql, *data.values(), self.id_)
else: # insert
cols = ', '.join(data.keys())
vals = ', '.join([f'${i + 1}' for i in range(len(data))])
sql = (
f"INSERT INTO {self.table} ({cols}) VALUES ({vals})"
f" RETURNING id"
)
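            # Resulting statement for e.g. a Crawl instance (column names are
            # illustrative): INSERT INTO crawl (site_id, is_full, ...)
            # VALUES ($1, $2, ...) RETURNING id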
self.id_ = await conn.fetchval(sql, *data.values())
def asdict(self):
"""
Return instance data as dictionary.
"""
return asdict(self)
async def delete(self, conn: Connection) -> None:
"""
Delete the object if it has an id_.
"""
if self.id_:
sql = f"DELETE FROM {self.table} WHERE id=$1"
await conn.execute(sql, self.id_)
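# A minimal usage sketch for ModelBase subclasses (assuming an asyncpg
# connection `conn`; Crawl is defined further below):
#
#     crawl = Crawl(site_id=1)
#     await crawl.save(conn)    # INSERT ... RETURNING id, sets crawl.id_
#     crawl.n_resources += 1
#     await crawl.save(conn)    # UPDATE, since id_ is now set
#     await crawl.delete(conn)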
class ResourceError:
"""
    Error encountered while trying to fetch a resource.
"""
def __init__(self, msg, status=None, headers=None):
self.msg = msg
self.status = status
self.headers = headers
def __repr__(self):
return f'ResourceError: {self.msg}'
class ResourceRedirect:
"""
A resource containing a redirect.
"""
def __init__(self, urls):
self.urls = urls
@dataclass
class TextResource(ModelBase):
"""
TextResource (without path).
TextResource models web resources with relevant text content.
    They are instantiated in the modules page, document, ...; their metadata
    is stored in table `resource` and the text content is stored with the
    search engine.
    Do not confuse with SitePath: several SitePath instances
    may point to one TextResource. The TextResource holds the actual content.
    Unless we are dealing with the startpage of a new site,
    the init_fields dict usually contains the site to which
    the resource belongs.
"""
table: ClassVar = 'resource'
init_fields: InitVar[dict] = None # additional fields after fetching
search_fields: InitVar[dict] = None # additional fields for indexing
# database fields
simhash: Optional[int] = None
content_type: Optional[str] = None
last_change: Optional[datetime] = None
text_len: int = 0
lang: Optional[str] = None
title: Optional[str] = None
summary: Optional[str] = None
def __post_init__(self, init_fields, search_fields):
if init_fields is None:
init_fields = {}
self.init_fields = init_fields
if search_fields is None:
search_fields = {}
self.search_fields = search_fields
self.site = self.init_fields.get('site')
self.site_id = self.site.id_ if self.site else None
self._update_simhash()
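    # Typical construction (a sketch; field values and init_fields keys other
    # than 'site' are illustrative):
    #
    #     resource = TextResource(
    #         init_fields={'site': site, 'headers': headers},
    #         search_fields={'text': text},
    #         content_type='html',
    #         lang='en',
    #         title=title,
    #     )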
def __str__(self):
return (
f'TextResource(id={self.id_},'
f' site_id={self.site_id},'
f' type={self.content_type})'
)
def _update_simhash(self):
"""
Update the simhash of the resource from its text content.
"""
if self.simhash is None:
text = self.search_fields.get('text', '')
self.simhash = simhash_to_bigint(get_simhash(text))
async def save(self, conn: Connection):
"""
Save the instance, extending the parent's method.
"""
self.content_type = (
self.content_type[:50] if self.content_type else None
)
self.title = self.title[:200] if self.title else None
self.summary = self.summary[:400] if self.summary else None
self._update_simhash()
if self.last_change is None:
self.last_change = datetime.utcnow()
await super().save(conn)
async def update_from_resource(self, upd: 'TextResource'):
"""
Update self with values from another resource.
"""
names = [field.name for field in fields(self)]
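        # Fill in only those fields that are currently empty on self;
        # existing values are never overwritten.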
for name in names:
cur_val = getattr(self, name)
upd_val = getattr(upd, name)
if not cur_val and upd_val is not None:
setattr(self, name, upd_val)
init_names = [
'headers',
'redirects',
'links_int',
'links_ext',
'shortlinks',
'canonical',
#'head',
]
self.init_fields = upd.init_fields
self.search_fields = upd.search_fields
# for init_name in init_names:
# cur_val = self.init_fields.get(init_name)
# upd_val = upd.init_fields.get(init_name)
# if not cur_val and upd_val is not None:
# self.init_fields[init_name] = upd_val
@dataclass
class MetaResource(ModelBase):
"""
Parent class for Feed, Sitemap, SitemapIndex.
    Their instances are not stored. Note: class Feed contains feed metadata
    and is stored in the database.
"""
@dataclass
class SitemapIndex(MetaResource):
"""
A SitemapIndex meta resource.
    Just a list of the sitemap URLs, nothing more.
"""
sitemaps: list = field(default_factory=list)
@dataclass
class Sitemap(MetaResource):
"""
A Sitemap meta resource.
Just a list of the resulting links, nothing more.
"""
urls: list = field(default_factory=list)
@dataclass
class Feed(MetaResource):
"""
    A site's feed (RSS, Atom, ...).
"""
table: ClassVar = 'site_feed'
entries: InitVar[list] = None
site_id: Optional[int] = None
url: Optional[str] = None
etag: Optional[str] = None
modified: Optional[str] = None
t_visit: Optional[datetime] = None
t_content: Optional[datetime] = None
version: Optional[str] = None
title: Optional[str] = None
description: Optional[str] = None
fail_count: int = 0
def __post_init__(self, entries):
self.entries = entries
def __str__(self):
return f'Feed(id={self.id_}, site_id={self.site_id}, url={self.url})'
async def save(self, conn: Connection):
"""
Save, trying to merge with existing entry matching on site_id and url.
"""
if not self.site_id or not self.url:
            msg = 'Saving feed failed: missing site_id or url'
logger.error(msg)
return
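        # If a feed row for this site_id and url already exists, adopt its id
        # so that the parent save() performs an UPDATE instead of an INSERT.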
sql = "SELECT id FROM site_feed WHERE site_id=$1 AND url=$2"
self.id_ = await conn.fetchval(sql, self.site_id, self.url)
await super().save(conn)
def debug(self) -> str:
"""
        Return the instance data as a string for debug print output.
"""
return (
f'Feed:\n'
f'- id: {self.id_}\n'
f'- site_id: {self.site_id}\n'
f'- url: {self.url}\n'
f'- etag: {self.etag}\n'
f'- modified: {self.modified}\n'
f'- t_visit: {self.t_visit}\n'
f'- t_content: {self.t_content}\n'
f'- version: {self.version}\n'
f'- title: {self.title}\n'
f'- description: {self.description}\n'
f'- fail_count: {self.fail_count}\n'
f'- entries: {self.entries}'
)
@dataclass
class Site(ModelBase):
"""
Website.
"""
table: ClassVar = 'site'
base_durl: InitVar[Durl] = None
feeds: InitVar[dict] = None
links_ext: InitVar[dict] = None
links_int: InitVar[dict] = None
startpage_text: InitVar[str] = None
canonical_url: Optional[str] = None
base_url: Optional[str] = None
base_urls: list[str] = field(default_factory=list)
domains: list[str] = field(default_factory=list)
ips: Optional[list[str]] = None
crawl_enabled: bool = False
crawl_active: bool = False
next_full_crawl: Optional[datetime] = None
next_feed_crawl: Optional[datetime] = None
last_update: Optional[datetime] = None
last_pub: Optional[datetime] = None
pub_dates: Optional[dict[str, str]] = None
langs: list[str] = field(default_factory=list)
alt_langs: dict[str, str] = field(default_factory=dict)
title: Optional[str] = None
description: Optional[str] = None
keywords: list[str] = field(default_factory=list)
linkbacks: dict[str, str] = field(default_factory=dict)
meta_info: dict = field(default_factory=dict)
boilerplate_texts: list[str] = field(default_factory=list)
def __post_init__(
self,
base_durl: Durl,
feeds=None,
links_ext=None,
links_int=None,
startpage_text=None,
):
self.feeds = feeds
self.links_ext = links_ext
self.links_int = links_int
self.startpage_text = startpage_text
self.keywords = self.keywords[:20]
if not self.last_update:
self.last_update = datetime.utcnow()
pub_date: Optional[str]
if self.last_pub:
pub_date = date.isoformat(self.last_pub.date())
self.pub_dates = {date.isoformat(self.last_update): pub_date}
else:
pub_date = None
self.pub_dates = {}
if base_durl:
self.base_urls = [base_durl.url()[:200]]
self.domains = [extract_domain(base_durl.hostname)[:100]]
def __str__(self):
return (
f'Site(id={self.id_}, url={self.base_url},'
f' crawl_enabled={self.crawl_enabled})'
)
async def update_base_url(self) -> None:
"""
Update the base_url, choosing the most relevant URL.
If canonical_url is not None, use this.
Otherwise set self.base_url to the shortest from self.base_urls,
but requiring a https-url if there is at least one.
"""
if self.canonical_url and self.canonical_url not in self.base_urls:
if canonical_durl := await Durl(self.canonical_url):
self.base_urls.append(self.canonical_url)
domain = extract_domain(canonical_durl.hostname)
if domain not in self.domains:
self.domains.append(domain)
if self.canonical_url:
self.base_url = self.canonical_url
return
if not self.base_url:
url_candidates = self.base_urls
if https_urls := [
url for url in self.base_urls if url.startswith('https://')
]:
url_candidates = https_urls
self.base_url = min(url_candidates, key=len)
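    # Illustration (hypothetical values): with canonical_url=None and
    # base_urls=['http://example.org/', 'https://example.org/'],
    # update_base_url() keeps only the https URL as a candidate and picks
    # 'https://example.org/' as base_url.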
async def save( # type: ignore
self, conn, merge=True
) -> tuple[Optional[int], bool]:
"""
Store the site, optionally trying to merge it with an existing site.
Return the id of the saved instance and whether a new instance
was created.
If self.id_ is not 0, replace the data of the existing site with
this id. Else if not merge, store as new row, and if merge,
try to merge with an existing matching site.
"""
await self.update_base_url()
if not merge:
created = not bool(self.id_)
await super().save(conn)
return self.id_, created
if self.id_:
sql = "SELECT base_urls, pub_dates FROM site WHERE id=$1"
row = await conn.fetchrow(sql, self.id_)
self.base_urls = list(
set(row['base_urls']).union(set(self.base_urls))
)
if previous_pub_dates := row['pub_dates']:
if not self.pub_dates:
self.pub_dates = {}
self.pub_dates.update(previous_pub_dates)
await super().save(conn)
return self.id_, False
same_site_id = await search_same_site(self, conn)
if same_site_id:
same_site = await Site().load(conn, same_site_id)
if same_site_id and same_site:
same_site.base_urls = set(same_site.base_urls).union(
set(self.base_urls)
)
same_site.domains = set(same_site.domains).union(set(self.domains))
if self.canonical_url and not same_site.canonical_url:
same_site.canonical_url = self.canonical_url
await same_site.save(conn, merge=False) # call ourselves
self.id_ = same_site.id_
return self.id_, False
else:
await super().save(conn)
return self.id_, True
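# Sketch of the intended call pattern for Site.save (values are illustrative):
#
#     site = Site(base_durl=durl, title=title, description=description)
#     site_id, created = await site.save(conn)          # merge with an existing site if possible
#     site_id, _ = await site.save(conn, merge=False)    # force an independent row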
@dataclass
class SitePath(ModelBase):
"""
Path of a website. May point to a Resource.
"""
table: ClassVar = 'site_path'
site: InitVar[str] = None
site_id: Optional[int] = None
path: Optional[str] = None
filtered: bool = False
last_visit: Optional[datetime] = None
ok_count: int = 0
canonical: Optional[bool] = None
resource_id: Optional[int] = None
def __str__(self):
return (
f'SitePath(id={self.id_}, site_id={self.site_id},'
f' path={self.path})'
)
async def save(self, conn: Connection):
"""
Save the instance, extending the parent's method.
"""
self.path = self.path[:400] if self.path else ''
await super().save(conn)
async def unlink_resource(self, conn, engine, index_base_name):
"""
Unlink the resource and also delete it, if it has no more links.
"""
if self.id_:
if self.resource_id:
sql = "SELECT COUNT(*) FROM site_path WHERE resource_id=$1"
ref_count = await conn.fetchval(sql, self.resource_id)
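                # Delete the resource itself only when no site_path row
                # references it any more; the RETURNING clause also yields its
                # lang, which is needed to remove it from the search index.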
if ref_count == 0:
sql = (
"DELETE FROM resource WHERE id=$1"
" RETURNING (true, lang)"
)
found = await conn.fetchval(sql, self.resource_id)
if found:
await delete_resource(
engine, found[1], self.resource_id
)
self.resource_id = None
def url(self, site):
"""
Return the full URL (combine the site's base_url with our path).
"""
return site.base_url + self.path
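# Example: for a site with base_url 'https://example.org' and a SitePath with
# path '/blog/first-post', url(site) returns
# 'https://example.org/blog/first-post'.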
@dataclass
class Crawl(ModelBase):
"""
The crawl process of a website (begin, end, statistics, ...).
"""
table: ClassVar = 'crawl'
site_id: Optional[int] = None
is_full: bool = False
    t_begin: datetime = field(default_factory=datetime.utcnow)
t_end: Optional[datetime] = None
n_resources: int = 0
n_resources_new: int = 0
async def finish(self, conn, set_t_end):
"""
Save the crawl. Set t_end only if indicated.
"""
if set_t_end:
self.t_end = datetime.utcnow()
await self.save(conn)
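# Sketch of a crawl lifecycle (a minimal illustration, assuming an asyncpg
# connection `conn` and a saved Site instance `site`):
#
#     crawl = Crawl(site_id=site.id_, is_full=True)
#     await crawl.save(conn)            # records the crawl with its start time
#     ...                               # fetch and process the site's resources
#     crawl.n_resources += 1
#     await crawl.finish(conn, True)    # sets t_end and saves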
async def search_same_site(
site: Site,
conn: Connection,
) -> Optional[int]:
"""
Try to find a matching site for the given *site* and return its id.
TODO: if the path is non-trivial, require it also for the matching site
Two sites match when they return the same content for identical paths.
The base_url (scheme and/or netloc) may differ.
    We do not have the content for all paths of both websites, so we need
    to estimate: we only take into account meta information from the
    start pages of both sites, in particular the title, the description
    and information obtained from the base_urls.
We use a combination of these conditions:
1. one of the sites has a canonical URL which matches the
URL of the other site
2. the content fields (title, description) have sufficient information
3. the content fields match exactly
4. the domain matches
5. the domain matches, except for the TLD
6. the base_urls differ in their schemes (http vs. https)
7. the hostnames in the base_urls are identical
8. the hostnames in the base_urls differ by a prepended 'www.'
9. the IPs have at least one common address
The algorithm is this (first answer is final, yes means match):
* if (1) : yes
* if (2), (3), (4) : yes
* if (2), (3), (5), (9) : yes
* if (6), ((7) or (8)) : yes
* no
"""
# rule (1)
if site.canonical_url:
sql = "SELECT id FROM site WHERE $1=ANY(base_urls) LIMIT 1"
id_ = await conn.fetchval(sql, site.canonical_url)
if id_:
return id_
else:
sql = "SELECT id FROM site WHERE canonical_url=ANY($1) LIMIT 1"
id_ = await conn.fetchval(sql, site.base_urls)
if id_:
return id_
# rule (6), ((7) or (8))
url_variants = set(
chain.from_iterable(
get_url_variants(base_url) for base_url in site.base_urls
)
)
sql = f"SELECT id FROM site WHERE base_urls && $1 LIMIT 1"
if id_ := await conn.fetchval(sql, url_variants):
return id_
# condition (2)
if len(site.title or '') > 15 or len(site.description or '') > 15:
        sql = (
            "SELECT * FROM site WHERE"
            " COALESCE(title, '')=$1 AND COALESCE(description, '')=$2"
        )
rows = await conn.fetch(sql, site.title or '', site.description or '')
# condition (3)
if rows:
# condition (4)
for row in rows:
domains = set(row.get('domains', []))
if domains & set(site.domains):
return row['id']
# condition (9)
for row in rows:
ips = set(row.get('ips', []))
if site.ips and ips & set(site.ips):
# condition (5)
domains_ = row.get('domains', [])
d1 = set([tldextract.extract(d).domain for d in domains_])
domains_ = site.domains or []
d2 = set([tldextract.extract(d).domain for d in domains_])
if d1 & d2:
return row['id']
return None