atextcrawler/src/atextcrawler/site/operations.py

"""
Operations on sites.
"""

import logging
from datetime import datetime, timedelta
from typing import Optional

from asyncpg import Connection

from ..models import Crawl, Site, TextResource
from ..resource import (
    add_site_paths,
    extract_sitemap_paths,
    get_sitemap_urls,
    store_boilerplate_texts,
)
from ..utils.durl import Durl
from ..utils.similarity import get_simhash_index
from .feeds import fetch_feeds, store_new_feeds
from .parse import parse_startpage
from .robots import RobotsInfo

logger = logging.getLogger(__name__)


async def checkout_site(
    config, conn: Connection
) -> tuple[Optional[Site], bool, bool]:
    """
    Get a site to be crawled and mark it with crawl_active=true.

    Also return whether the site shall be fully crawled; if not, only
    the resources from the site's feeds shall be crawled.
    Also return whether more sites might be available.
    """
    async with conn.transaction():
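        # Select one due site without blocking parallel workers:
        # FOR UPDATE SKIP LOCKED skips rows locked by concurrent
        # transactions, and the is_full column tells whether a full
        # crawl (rather than only a feed crawl) is due.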
        sql = (
            "SELECT id, next_full_crawl < now() at time zone 'UTC' is_full"
            " FROM site WHERE crawl_enabled AND crawl_active = false"
            " AND (next_full_crawl < now() at time zone 'UTC'"
            " OR next_feed_crawl < now() at time zone 'UTC')"
            " LIMIT 1 FOR UPDATE SKIP LOCKED"
        )
        row = await conn.fetchrow(sql)
        if row:
            site_id = row['id']
            is_full = row['is_full']
            sql = "UPDATE site SET crawl_active = true WHERE id=$1"
            await conn.execute(sql, site_id)
            site = await Site().load(conn, site_id)
            if site:
                site.base_durl = await Durl(site.base_url)
                if site.base_durl:
                    site.simhash_index = await get_simhash_index(conn, site_id)
                    return site, is_full, True
                else:
                    # site not available; schedule next crawl
                    int_full = config['crawl']['full_crawl_interval']
                    int_feed = config['crawl']['feed_crawl_interval']
                    now = datetime.utcnow()
                    t_full = now + timedelta(seconds=int_full)
                    t_feed = now + timedelta(seconds=int_full + int_feed)
                    sql = (
                        "UPDATE site SET crawl_active=false,"
                        " next_full_crawl=$1, next_feed_crawl=$2"
                        " WHERE id=$3"
                    )
                    await conn.execute(sql, t_full, t_feed, site_id)
                    return None, False, True
            return None, False, True
    return None, False, False


async def update_site(
    app, fetcher, conn: Connection, base_url, site: Optional[Site] = None
) -> tuple[Optional[Site], bool]:
    """
    Try to fetch base_url and return a site and whether a new one was created.

    This function is run for all sites (including blacklisted and irrelevant
    ones). It determines whether the site shall be crawled.

    If an error occurs, return (None, False); if a site was given,
    also set it to crawl_enabled=False and remove its crawling schedules.

    If base_url could be fetched, update the site, possibly creating
    a new one.

    If the site has crawl_enabled and no full crawl is scheduled,
    schedule one (by updating column `next_full_crawl`).
    """
    # fetch startpage
    logger.info(f'Updating site={site}, base_url={base_url}')
    resource = await fetcher.fetch(base_url, site=site)
    if (
        not isinstance(resource, TextResource)
        or resource.content_type != 'html'
    ):
        if site:
            site.meta_info['error'] = 'Invalid start page'
            site.crawl_enabled = False
            site.next_full_crawl = None
            site.next_feed_crawl = None
            await site.save(conn)
        logger.info(f'Failed startpage {base_url}: {resource}')
        return None, False

    # parse startpage (extract site information) and save the site
    site = await parse_startpage(resource, app=app, site=site)
    site_id, created = await site.save(conn)
    if created:
        logger.debug(f'Created {site}')

    # add black-/white-listing info
    is_allowed = await is_site_allowed(conn, site.id_, base_url)
    if is_allowed is not None and is_allowed != site.crawl_enabled:
        site.crawl_enabled = is_allowed
        await site.save(conn)

    # schedule full crawl, if none is scheduled and the site shall be crawled
    if site.crawl_enabled:
        sql = (
            "UPDATE site"
            " SET next_full_crawl=now() at time zone 'UTC'"
            " WHERE id=$1 AND next_full_crawl IS null"
        )
        await conn.execute(sql, site_id)

    return site, created


async def is_site_allowed(
    conn: Connection,
    site_id: Optional[int],
    base_url: str,
) -> Optional[bool]:
    """
    Return True if the site is whitelisted, False if blacklisted, else None.

    Also add missing site_ids to the annotations.
    """
    sql = "SELECT * FROM site_annotation WHERE site_id=$1 OR base_url=$2"
    anns = await conn.fetch(sql, site_id, base_url)
    for ann in anns:
        if ann['ann_type'] == 'blacklist':
            return False
        if ann['ann_type'] == 'whitelist':
            return True

    # add missing site_ids
    if site_id and any([ann['site_id'] is None for ann in anns]):
        sql = "UPDATE site_annotation SET site_id=$1 WHERE base_url=$2"
        await conn.execute(sql, site_id, base_url)

    return None


async def process_site(fetcher, conn: Connection, site: Site):
    """
    Process a site: fetch and store more information.

    Store external and internal links; find boilerplate texts;
    fetch sitemaps; fetch feeds; update date of last publication.
    """
    if not site.id_:  # only to satisfy typing
        return

    if site.links_ext:
        await _store_cross_site_links(conn, site.id_, site.links_ext)
    if site.links_int:
        paths = []
        for durl, (rel, _) in site.links_int.items():
            canon = (rel and rel.lower() == 'canonical') or None
            paths.append((durl.pwa(), canon))
        await add_site_paths(conn, site.id_, paths)

    await store_boilerplate_texts(fetcher, conn, site)

    # get sitemaps and add their resources
    robots = await RobotsInfo(site.base_url)  # type: ignore
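    # robots.site_maps presumably holds the sitemap URLs declared in
    # the site's robots.txt (if any)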
    urls = await get_sitemap_urls(
        fetcher, site.base_url, sitemaps=robots.site_maps
    )
    paths_, latest = extract_sitemap_paths(site.base_url, urls)
    await add_site_paths(conn, site.id_, paths_)

    # store feeds and their resources
    await store_new_feeds(conn, site.id_, site.feeds)
    latest_ = await fetch_feeds(fetcher, conn, site)
    if latest_:
        latest = max(latest or latest_, latest_)

    # update last_pub
    if latest:
        site.last_pub = latest
        await site.save(conn)


async def checkin_site(app, conn: Connection, site: Site, crawl: Crawl):
    """
    Unlock the site and schedule the next crawl.

    *crawl* is the crawl that has just finished (regularly or stopped).

    If the crawl was stopped (t_end is None), just unlock the site.
    Otherwise schedule a crawl of the same type. After a full crawl,
    a feed crawl is also scheduled, unless one is already scheduled.
    """
    if crawl.t_end is None:
        sql = "UPDATE site SET crawl_active=false WHERE id=$1"
        await conn.execute(sql, site.id_)
    elif crawl.is_full:
        full_interval = app.config['crawl']['full_crawl_interval']
        feed_interval = app.config['crawl']['feed_crawl_interval']
        next_full_crawl = crawl.t_begin + timedelta(seconds=full_interval)
        next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval)
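        # coalesce keeps an already scheduled feed crawl and only
        # fills in next_feed_crawl if none is scheduled yet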
        sql = (
            "UPDATE site SET crawl_active=false, next_full_crawl=$1,"
            " next_feed_crawl=coalesce(next_feed_crawl, $2) WHERE id=$3"
        )
        await conn.execute(sql, next_full_crawl, next_feed_crawl, site.id_)
    else:
        feed_interval = app.config['crawl']['feed_crawl_interval']
        next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval)
        sql = (
            "UPDATE site SET crawl_active=false, next_feed_crawl=$1"
            " WHERE id=$2"
        )
        await conn.execute(sql, next_feed_crawl, site.id_)
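

# Usage sketch (hypothetical, not part of this module): a crawler worker
# could pair checkout_site() with checkin_site() roughly like this,
# assuming an asyncpg connection `conn`, the application `app` and a
# Crawl record `crawl` obtained elsewhere:
#
#     while True:
#         site, is_full, more = await checkout_site(app.config, conn)
#         if site is None:
#             if not more:
#                 break  # nothing due; caller may back off and retry
#             continue
#         try:
#             ...  # full or feed crawl of `site`, depending on is_full
#         finally:
#             await checkin_site(app, conn, site, crawl)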


async def _store_cross_site_links(
    conn: Connection,
    site_id: int,
    links: dict[Durl, tuple[list[str], str]],
) -> None:
    """
    Put outgoing links into site_link/site_queue for existing/unknown sites.

    Separate outgoing links from *site_id* into two classes:
    (a) existing sites (rows in table site) and (b) unknown links.

    Add links from class (a) to table site_link.
    Add links from class (b) to table site_queue.
    """
    # add outgoing cross-site links for existing sites to table site_link
    urls = [url.site() for url in links.keys()]
    values = []
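    # && is the PostgreSQL array-overlap operator: select sites whose
    # base_urls array shares at least one element with the outgoing
    # link URLs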
sql = "SELECT id, unnest(base_urls) url FROM site WHERE base_urls && $1"
if rows := await conn.fetch(sql, urls):
for row in rows:
if (durl := await Durl(row['url'])) in links.keys():
_, link_text = links.pop(durl)
if site_id != row['id']:
values.append((site_id, row['id'], link_text))
sql = (
"INSERT INTO site_link (src, dst, link_text)"
" VALUES ($1, $2, $3) ON CONFLICT (src, dst) DO NOTHING"
)
await conn.executemany(sql, values)
# add outgoing cross-site links for unknown sites to table site_queue
sql = "INSERT INTO site_queue (src, url, link_text) VALUES ($1, $2, $3)"
    values = [
        (site_id, durl.site()[:200], link_text[:100])
        for durl, (_, link_text) in links.items()
    ]
    await conn.executemany(sql, values)