"""
|
|
Operations on sites.
|
|
"""
|
|
|
|
import logging
|
|
from datetime import datetime, timedelta
|
|
from typing import Optional
|
|
|
|
from asyncpg import Connection
|
|
|
|
from ..models import Crawl, Site, TextResource
|
|
from ..resource import (
|
|
add_site_paths,
|
|
extract_sitemap_paths,
|
|
get_sitemap_urls,
|
|
store_boilerplate_texts,
|
|
)
|
|
from ..utils.durl import Durl
|
|
from ..utils.similarity import get_simhash_index
|
|
from .feeds import fetch_feeds, store_new_feeds
|
|
from .parse import parse_startpage
|
|
from .robots import RobotsInfo
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def checkout_site(
    config, conn: Connection
) -> tuple[Optional[Site], bool, bool]:
    """
    Get a site to be crawled and mark it with crawl_active=true.

    Also return whether the site shall be fully crawled; if not, only
    the resources from the site's feeds shall be crawled.

    Also return whether more sites might be available.
    """
    async with conn.transaction():
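        # Pick one due site and lock its row; FOR UPDATE SKIP LOCKED lets
        # concurrent workers check out different sites without blocking
        # each other.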
        sql = (
            "SELECT id, next_full_crawl < now() at time zone 'UTC' is_full"
            " FROM site WHERE crawl_enabled AND crawl_active = false"
            " AND (next_full_crawl < now() at time zone 'UTC'"
            " OR next_feed_crawl < now() at time zone 'UTC')"
            " LIMIT 1 FOR UPDATE SKIP LOCKED"
        )
        row = await conn.fetchrow(sql)
        if row:
            site_id = row['id']
            is_full = row['is_full']
            sql = "UPDATE site SET crawl_active = true WHERE id=$1"
            await conn.execute(sql, site_id)
            site = await Site().load(conn, site_id)
            if site:
                site.base_durl = await Durl(site.base_url)
                if site.base_durl:
                    site.simhash_index = await get_simhash_index(conn, site_id)
                    return site, is_full, True
            else:
                # site not available; schedule next crawl
                int_full = config['crawl']['full_crawl_interval']
                int_feed = config['crawl']['feed_crawl_interval']
                now = datetime.utcnow()
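                # the next feed crawl is scheduled to follow the next full crawl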
                t_full = now + timedelta(seconds=int_full)
                t_feed = now + timedelta(seconds=int_full + int_feed)
                sql = (
                    "UPDATE site SET crawl_active=false,"
                    " next_full_crawl=$1, next_feed_crawl=$2"
                    " WHERE id=$3"
                )
                await conn.execute(sql, t_full, t_feed, site_id)
                return None, False, True
            return None, False, True
        return None, False, False


async def update_site(
    app, fetcher, conn: Connection, base_url, site: Optional[Site] = None
) -> tuple[Optional[Site], bool]:
    """
    Try to fetch base_url and return a site and whether a new one was created.

    This function is run for all sites (including blacklisted and irrelevant
    ones). It determines whether the site shall be crawled.

    If an error occurs, return (None, False), and if a site was given,
    also set it to crawl_enabled=False and remove crawling schedules.

    If base_url could be fetched, update the site, possibly creating
    a new one.

    If the site has crawl_enabled, and no full crawl is scheduled,
    schedule one (by updating column `next_full_crawl`).
    """
    # fetch startpage
    logger.info(f'Updating site={site}, base_url={base_url}')
    resource = await fetcher.fetch(base_url, site=site)
    if (
        not isinstance(resource, TextResource)
        or resource.content_type != 'html'
    ):
        if site:
            site.meta_info['error'] = 'Invalid start page'
            site.crawl_enabled = False
            site.next_full_crawl = None
            site.next_feed_crawl = None
            await site.save(conn)
        logger.info(f'Failed startpage {base_url}: {resource}')
        return None, False

    # parse startpage (extract site information) and save the site
    site = await parse_startpage(resource, app=app, site=site)
    site_id, created = await site.save(conn)
    if created:
        logger.debug(f'Created {site}')

    # add black-/white-listing info
    is_allowed = await is_site_allowed(conn, site.id_, base_url)
    if is_allowed is not None and is_allowed != site.crawl_enabled:
        site.crawl_enabled = is_allowed
        await site.save(conn)

    # schedule full crawl, if none is scheduled and the site shall be crawled
    if site.crawl_enabled:
        sql = (
            "UPDATE site"
            " SET next_full_crawl=now() at time zone 'UTC'"
            " WHERE id=$1 AND next_full_crawl IS null"
        )
        await conn.execute(sql, site_id)

    return site, created


async def is_site_allowed(
    conn: Connection,
    site_id: Optional[int],
    base_url: str,
) -> Optional[bool]:
    """
    Return True if the site is whitelisted, False if blacklisted, else None.

    Also add missing site_ids to the annotations.
    """
    sql = "SELECT * FROM site_annotation WHERE site_id=$1 OR base_url=$2"
    anns = await conn.fetch(sql, site_id, base_url)
    for ann in anns:
        if ann['ann_type'] == 'blacklist':
            return False
        if ann['ann_type'] == 'whitelist':
            return True
    # add missing site_ids
    if site_id and any(ann['site_id'] is None for ann in anns):
        sql = "UPDATE site_annotation SET site_id=$1 WHERE base_url=$2"
        await conn.execute(sql, site_id, base_url)
    return None


async def process_site(fetcher, conn: Connection, site: Site):
    """
    Process a site: fetch and store more information.

    Store external and internal links; find boilerplate texts;
    fetch sitemaps; fetch feeds; update date of last publication.
    """
    if not site.id_:  # only to satisfy typing
        return
    if site.links_ext:
        await _store_cross_site_links(conn, site.id_, site.links_ext)
    if site.links_int:
        paths = []
        for durl, (rel, _) in site.links_int.items():
            canon = (rel and rel.lower() == 'canonical') or None
            paths.append((durl.pwa(), canon))
        await add_site_paths(conn, site.id_, paths)

    await store_boilerplate_texts(fetcher, conn, site)

    # get sitemaps and add their resources
    robots = await RobotsInfo(site.base_url)  # type: ignore
    urls = await get_sitemap_urls(
        fetcher, site.base_url, sitemaps=robots.site_maps
    )
    paths_, latest = extract_sitemap_paths(site.base_url, urls)
    await add_site_paths(conn, site.id_, paths_)

    # store feeds and their resources
    await store_new_feeds(conn, site.id_, site.feeds)
    latest_ = await fetch_feeds(fetcher, conn, site)
    if latest_:
        latest = max(latest or latest_, latest_)

    # update last_pub
    if latest:
        site.last_pub = latest
        await site.save(conn)


async def checkin_site(app, conn: Connection, site: Site, crawl: Crawl):
    """
    Unlock the site and schedule the next crawl.

    *crawl* is the crawl that has just finished (regularly or stopped).

    If the crawl was stopped (t_end is None), just unlock the site.

    Otherwise schedule a crawl of the same type. After a full crawl,
    a feed crawl is also scheduled if none is scheduled yet.
    """
    if crawl.t_end is None:
        sql = "UPDATE site SET crawl_active=false WHERE id=$1"
        await conn.execute(sql, site.id_)
    elif crawl.is_full:
        full_interval = app.config['crawl']['full_crawl_interval']
        feed_interval = app.config['crawl']['feed_crawl_interval']
        next_full_crawl = crawl.t_begin + timedelta(seconds=full_interval)
        next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval)
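        # coalesce() keeps an already scheduled feed crawl and only falls
        # back to t_begin + feed_interval if none is scheduled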
        sql = (
            "UPDATE site SET crawl_active=false, next_full_crawl=$1,"
            " next_feed_crawl=coalesce(next_feed_crawl, $2) WHERE id=$3"
        )
        await conn.execute(sql, next_full_crawl, next_feed_crawl, site.id_)
    else:
        feed_interval = app.config['crawl']['feed_crawl_interval']
        next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval)
        sql = (
            "UPDATE site SET crawl_active=false, next_feed_crawl=$1"
            " WHERE id=$2"
        )
        await conn.execute(sql, next_feed_crawl, site.id_)


async def _store_cross_site_links(
    conn: Connection,
    site_id: int,
    links: dict[Durl, tuple[list[str], str]],
) -> None:
    """
    Put outgoing links into site_link/site_queue for existing/unknown sites.

    Separate outgoing links from *site_id* into two classes:
    (a) existing sites (rows in table site) and (b) unknown links.
    Add links from class (a) to table site_link.
    Add links from class (b) to table site_queue.
    """
    # add outgoing cross-site links for existing sites to table site_link
    urls = [url.site() for url in links.keys()]
    values = []
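    # `&&` is PostgreSQL's array-overlap operator: match sites whose
    # base_urls share at least one URL with the outgoing links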
    sql = "SELECT id, unnest(base_urls) url FROM site WHERE base_urls && $1"
    if rows := await conn.fetch(sql, urls):
        for row in rows:
            if (durl := await Durl(row['url'])) in links.keys():
                _, link_text = links.pop(durl)
                if site_id != row['id']:
                    values.append((site_id, row['id'], link_text))
        sql = (
            "INSERT INTO site_link (src, dst, link_text)"
            " VALUES ($1, $2, $3) ON CONFLICT (src, dst) DO NOTHING"
        )
        await conn.executemany(sql, values)

    # add outgoing cross-site links for unknown sites to table site_queue
    sql = "INSERT INTO site_queue (src, url, link_text) VALUES ($1, $2, $3)"
    values = [
        (site_id, durl.site()[:200], link_text[:100])
        for durl, (_, link_text) in links.items()
    ]
    await conn.executemany(sql, values)
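

# A minimal sketch of how a crawler worker might drive the checkout/checkin
# cycle above. `run_crawl` is a hypothetical callable standing in for the
# actual crawl loop; it is assumed to return the finished Crawl instance
# that checkin_site() expects.
async def _example_worker_loop(app, fetcher, conn: Connection, run_crawl):
    while True:
        site, is_full, more = await checkout_site(app.config, conn)
        if site is None:
            if not more:
                break  # no site is currently due for crawling
            continue  # a due site could not be checked out; try the next one
        crawl = await run_crawl(app, fetcher, conn, site, is_full)
        await checkin_site(app, conn, site, crawl)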
|