"""
Operations on sites.
"""

import logging
from datetime import datetime, timedelta, timezone
from typing import Optional

from asyncpg import Connection

from ..models import Crawl, Site, TextResource
from ..resource import (
    add_site_paths,
    extract_sitemap_paths,
    get_sitemap_urls,
    store_boilerplate_texts,
)
from ..utils.durl import Durl
from ..utils.similarity import get_simhash_index
from .feeds import fetch_feeds, store_new_feeds
from .parse import parse_startpage
from .robots import RobotsInfo

logger = logging.getLogger(__name__)


async def checkout_site(
    app, conn: Connection
) -> tuple[Optional[Site], bool, bool]:
    """
    Pick a site that is due for crawling and mark it crawl_active=true.

    A site is due if it has crawl_enabled, is not crawl_active and its
    `next_full_crawl` or `next_feed_crawl` lies in the past (UTC).
    Selecting and marking happen in one transaction; the row is selected
    with FOR UPDATE SKIP LOCKED.

    Return a triple `(site, is_full, more)`:

      * `site`: the loaded site with `base_durl` and `simhash_index` set,
        or None if no usable site was obtained
      * `is_full`: whether the site shall be fully crawled; if not, just
        the resources from the feeds shall be crawled
      * `more`: whether more sites might be available; False only if the
        query found no due site at all

    If the base URL of the selected site is invalid, the site is
    unlocked again, its next crawls are postponed and
    `(None, False, True)` is returned.
    """
    async with conn.transaction():
        sql = (
            "SELECT id, next_full_crawl < now() at time zone 'UTC' is_full"
            " FROM site WHERE crawl_enabled AND crawl_active = false"
            " AND (next_full_crawl < now() at time zone 'UTC'"
            " OR next_feed_crawl < now() at time zone 'UTC')"
            " LIMIT 1 FOR UPDATE SKIP LOCKED"
        )
        row = await conn.fetchrow(sql)
        if not row:
            # nothing is due at the moment
            return None, False, False

        site_id = row['id']
        is_full = row['is_full']
        sql = "UPDATE site SET crawl_active = true WHERE id=$1"
        await conn.execute(sql, site_id)

        site = await Site().load(conn, site_id)
        if not site:
            return None, False, True

        site.base_durl = await Durl(site.base_url)
        if site.base_durl:
            site.simhash_index = await get_simhash_index(conn, site_id)
            return site, is_full, True

        # site not available; schedule next crawl
        int_full = app.config['crawl']['full_crawl_interval']
        int_feed = app.config['crawl']['feed_crawl_interval']
        # Naive UTC timestamp, matching "now() at time zone 'UTC'" used in
        # the queries above (datetime.utcnow() is deprecated).
        now = datetime.now(timezone.utc).replace(tzinfo=None)
        t_full = now + timedelta(seconds=int_full)
        # the feed crawl is scheduled one feed interval after the full crawl
        t_feed = now + timedelta(seconds=int_full + int_feed)
        sql = (
            "UPDATE site SET crawl_active=false,"
            " next_full_crawl=$1, next_feed_crawl=$2"
            " WHERE id=$3"
        )
        await conn.execute(sql, t_full, t_feed, site_id)
        return None, False, True


async def update_site(
    app, fetcher, conn: Connection, base_url, site: Optional[Site] = None
) -> tuple[Optional[Site], bool]:
    """
    Try to fetch base_url and return a site and whether a new one was created.

    This function is run for all sites (including blacklisted and irrelevant
    ones). It determines whether the site shall be crawled.

    If an error occurs (the start page is not an HTML text resource),
    return (None, False), and if a site was given, also set it to
    crawl_enabled=False and remove crawling schedules.

    If base_url could be fetched, update the site, possibly creating
    a new one.

    If the site has crawl_enabled, and no full crawl is scheduled,
    schedule one (by updating column `next_full_crawl`).
    """
    # fetch startpage
    logger.info(f'Updating site={site}, base_url={base_url}')
    resource = await fetcher.fetch(base_url, site=site)
    if (
        not isinstance(resource, TextResource)
        or resource.content_type != 'html'
    ):
        if site:
            site.meta_info['error'] = 'Invalid start page'
            site.crawl_enabled = False
            site.next_full_crawl = None
            site.next_feed_crawl = None
            await site.save(conn)
        logger.info(f'Failed startpage {base_url}: {resource}')
        return None, False

    # parse startpage (extract site information) and save the site
    site = await parse_startpage(resource, app=app, site=site)
    site_id, created = await site.save(conn)
    if created:
        logger.debug(f'Created {site}')

    # add black-/white-listing info
    is_allowed = await is_site_allowed(conn, site.id_, base_url)
    if is_allowed is not None and is_allowed != site.crawl_enabled:
        site.crawl_enabled = is_allowed
        await site.save(conn)

    # schedule full crawl, if none is scheduled and the site shall be crawled
    if site.crawl_enabled:
        sql = (
            "UPDATE site"
            " SET next_full_crawl=now() at time zone 'UTC'"
            " WHERE id=$1 AND next_full_crawl IS null"
        )
        await conn.execute(sql, site_id)

    return site, created


async def is_site_allowed(
    conn: Connection,
    site_id: Optional[int],
    base_url: str,
) -> Optional[bool]:
    """
    Return True if the site is whitelisted, False if blacklisted, else None.

    Annotations are looked up by *site_id* or by *base_url*. The first
    annotation (in the order returned by the database) whose type is
    'blacklist' or 'whitelist' decides the result.

    Also add missing site_ids to the annotations: if *site_id* is given
    and an annotation found has no site_id, the annotations for
    *base_url* are updated with *site_id*.
    """
    sql = "SELECT * FROM site_annotation WHERE site_id=$1 OR base_url=$2"
    anns = await conn.fetch(sql, site_id, base_url)

    # Add missing site_ids. This must happen before the verdict is
    # returned; otherwise annotated sites would never get their
    # annotations linked.
    if site_id and any(ann['site_id'] is None for ann in anns):
        sql = "UPDATE site_annotation SET site_id=$1 WHERE base_url=$2"
        await conn.execute(sql, site_id, base_url)

    for ann in anns:
        if ann['ann_type'] == 'blacklist':
            return False
        if ann['ann_type'] == 'whitelist':
            return True
    return None


async def process_site(fetcher, conn: Connection, site: Site):
    """
    Process a site: fetch and store more information.

    Store external and internal links; find boilerplate texts;
    fetch sitemaps; fetch feeds; update date of last publication.

    Nothing is done for a site without an id.
    """
    if not site.id_:  # only to satisfy typing
        return

    if site.links_ext:
        await _store_cross_site_links(conn, site.id_, site.links_ext)

    if site.links_int:
        paths = []
        for durl, (rel, _) in site.links_int.items():
            # True for canonical links, otherwise None (never False)
            canon = (rel and rel.lower() == 'canonical') or None
            paths.append((durl.pwa(), canon))
        await add_site_paths(conn, site.id_, paths)

    await store_boilerplate_texts(fetcher, conn, site)

    # get sitemaps and add their resources
    robots = await RobotsInfo(site.base_url)  # type: ignore
    urls = await get_sitemap_urls(
        fetcher, site.base_url, sitemaps=robots.site_maps
    )
    paths_, latest = extract_sitemap_paths(site.base_url, urls)
    await add_site_paths(conn, site.id_, paths_)

    # store feeds and their resources
    await store_new_feeds(conn, site.id_, site.feeds)
    latest_ = await fetch_feeds(fetcher, conn, site)
    if latest_:
        # use the later one of the sitemap and feed dates; the sitemap
        # date may be missing
        latest = max(latest or latest_, latest_)

    # update last_pub
    if latest:
        site.last_pub = latest
        await site.save(conn)


async def checkin_site(app, conn: Connection, site: Site, crawl: Crawl):
    """
    Unlock the site and schedule next crawl.

    *crawl* is the crawl that has just finished (regularly or stopped).

    If the crawl was stopped (t_end is None), just unlock the site.

    Otherwise schedule a crawl of the same type, counting the configured
    interval from the begin of *crawl*. After a full crawl also a feed
    crawl is scheduled, if there was none scheduled.
    """
    if crawl.t_end is None:
        sql = "UPDATE site SET crawl_active=false WHERE id=$1"
        await conn.execute(sql, site.id_)
    elif crawl.is_full:
        full_interval = app.config['crawl']['full_crawl_interval']
        feed_interval = app.config['crawl']['feed_crawl_interval']
        next_full_crawl = crawl.t_begin + timedelta(seconds=full_interval)
        next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval)
        # coalesce keeps an already scheduled feed crawl untouched
        sql = (
            "UPDATE site SET crawl_active=false, next_full_crawl=$1,"
            " next_feed_crawl=coalesce(next_feed_crawl, $2) WHERE id=$3"
        )
        await conn.execute(sql, next_full_crawl, next_feed_crawl, site.id_)
    else:
        feed_interval = app.config['crawl']['feed_crawl_interval']
        next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval)
        sql = (
            "UPDATE site SET crawl_active=false, next_feed_crawl=$1"
            " WHERE id=$2"
        )
        await conn.execute(sql, next_feed_crawl, site.id_)


async def _store_cross_site_links(
    conn: Connection,
    site_id: int,
    links: dict[Durl, tuple[list[str], str]],
) -> None:
    """
    Put outgoing links into site_link/site_queue for existing/unknown sites.

    Separate outgoing links from *site_id* into two classes:
    (a) existing sites (rows in table site) and (b) unknown links.
    Add links from class (a) to table site_link.
    Add links from class (b) to table site_queue.

    NOTE: *links* is modified in place: the links of class (a) are
    removed from it, so that only class (b) remains afterwards.
    """
    # add outgoing cross-site links for existing sites to table site_link
    urls = [url.site() for url in links]
    values = []
    sql = "SELECT id, unnest(base_urls) url FROM site WHERE base_urls && $1"
    if rows := await conn.fetch(sql, urls):
        for row in rows:
            if (durl := await Durl(row['url'])) in links:
                _, link_text = links.pop(durl)
                # links of a site to itself are dropped
                if site_id != row['id']:
                    values.append((site_id, row['id'], link_text))
        sql = (
            "INSERT INTO site_link (src, dst, link_text)"
            " VALUES ($1, $2, $3) ON CONFLICT (src, dst) DO NOTHING"
        )
        await conn.executemany(sql, values)

    # add outgoing cross-site links for unknown sites to table site_queue
    sql = "INSERT INTO site_queue (src, url, link_text) VALUES ($1, $2, $3)"
    values = [
        (site_id, durl.site()[:200], link_text[:100])
        for durl, (_, link_text) in links.items()
    ]
    await conn.executemany(sql, values)