"""
|
|
Queue of sites.
|
|
|
|
When processing a resource, its external links are put into database table
|
|
`site_queue`.
|
|
The items in `site_queue` are processed in :func:`process_site_queue`.
|
|
This is done baseURL by baseURL (see :func:`iter_site_queue`).
|
|
While doing this, cross-site links are put into table `site_link`.
|
|
"""

import logging
from typing import AsyncIterator, Optional

import aiohttp
from asyncpg import Connection

from ..resource import ResourceFetcher
from .operations import update_site

logger = logging.getLogger(__name__)
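
# Configuration keys read in this module (the example values are illustrative
# only, not taken from a real configuration):
#
#   app.config['crawl']['site_delay']             e.g. 600; seconds to sleep
#                                                 when the queue is exhausted
#   app.config['crawl']['resource_delay']         e.g. 5; seconds to sleep
#                                                 between two queued sites
#   app.config['crawl']['site_revisit_interval']  e.g. 86400; sites updated
#                                                 within this many seconds are
#                                                 only linked, not re-crawled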


async def process_site_queue(app, pool):
    """
    Loop over queued sites, creating new sites and adding cross-site links.
    """
    site_delay = app.config['crawl']['site_delay']
    resource_delay = app.config['crawl']['resource_delay']
    async with pool.acquire() as conn:
        async with aiohttp.ClientSession() as session:
            fetcher = ResourceFetcher(session)
            while app.running:
                async for base_url, links_from in iter_site_queue(app, conn):
                    # get or create site
                    msg = f'Site queue: updating {base_url}'
                    logger.debug(msg)
                    site, created = await update_site(
                        app, fetcher, conn, base_url
                    )
                    if site:
                        await store_incoming_site_site_links(
                            conn, site.id_, links_from
                        )
                    # delete handled queue items
                    sql = "DELETE FROM site_queue WHERE url=$1"
                    await conn.execute(sql, base_url)
                    await app.sleep(resource_delay)
                logger.debug(
                    'Queued sites exhausted, sleeping'
                    f' for {site_delay} seconds'
                )
                await app.sleep(site_delay)


async def iter_site_queue(
    app, conn: Connection
) -> AsyncIterator[tuple[str, dict[int, str]]]:
    """
    Yield URLs with aggregated link information from site_queue.

    Each item is a URL together with a dict mapping ids of linking sites
    to link texts.
    """
    site_revisit_interval = app.config['crawl']['site_revisit_interval']
    while app.running:
        sql = (
            "SELECT url, array_agg(src) srcs,"
            " array_agg(link_text) link_texts"
            " FROM site_queue GROUP BY url LIMIT 1"
        )
        row = await conn.fetchrow(sql)
        if row:
            base_url = row['url']
            links_from = {}
            srcs = row['srcs']
            link_texts = row['link_texts']
            for i in range(len(srcs)):
                if src := srcs[i]:
                    links_from[src] = link_texts[i]
            if site_id := await site_recently_updated(
                conn, base_url, site_revisit_interval
            ):
                # just store incoming links and remove the site from the queue
                await store_incoming_site_site_links(conn, site_id, links_from)
                sql = "DELETE FROM site_queue WHERE url=$1"
                await conn.execute(sql, base_url)
            else:
                yield base_url, links_from
        else:
            break
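
# Illustration of the aggregation in iter_site_queue (made-up values): two
# queue rows for the same URL,
#
#     url='https://example.org/'  src=3     link_text='Example'
#     url='https://example.org/'  src=NULL  link_text=NULL
#
# are fetched as srcs=[3, None] and link_texts=['Example', None]; NULL sources
# are skipped, so if the site was not updated recently the generator yields
# ('https://example.org/', {3: 'Example'}).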


async def site_recently_updated(
    conn: Connection,
    base_url: str,
    site_revisit_interval: float,
) -> Optional[int]:
    """
    Return the id of the site with the given base_url if it was updated recently.
    """
    sql = (
        "SELECT id FROM site WHERE $1=any(base_urls)"
        f" AND last_update + interval '{site_revisit_interval} seconds'"
        " > now() at time zone 'utc' LIMIT 1"
    )
    site_id = await conn.fetchval(sql, base_url)
    return site_id
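
# The revisit interval interpolated into the SQL of site_recently_updated comes
# from the application config, not from user input.  A fully parameterised
# variant is also possible; a sketch, assuming PostgreSQL's make_interval()
# (version 9.4 or later) is available:
#
#     sql = (
#         "SELECT id FROM site WHERE $1=any(base_urls)"
#         " AND last_update + make_interval(secs => $2)"
#         " > now() at time zone 'utc' LIMIT 1"
#     )
#     site_id = await conn.fetchval(sql, base_url, site_revisit_interval)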


async def store_incoming_site_site_links(
    conn: Connection, site_id: int, links_from: dict
):
    """
    Store incoming site-site links (irrespective of crawl_enabled).

    *site_id* is the id of the site to which the links in *links_from* point.
    """
    sql = (
        "INSERT INTO site_link"
        " (src, dst, link_text) VALUES ($1, $2, $3)"
        " ON CONFLICT (src, dst) DO NOTHING"
    )
    values = [
        (from_id, site_id, link_text)
        for from_id, link_text in links_from.items()
        if from_id != site_id
    ]
    await conn.executemany(sql, values)
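
# Illustration (made-up ids): with site_id=5 and links_from={3: 'Example', 5: 'Self'},
# the self-referencing entry for site 5 is filtered out and a single row
# (3, 5, 'Example') is inserted into site_link; repeating the call is a no-op
# because of the ON CONFLICT (src, dst) DO NOTHING clause.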