atextcrawler/src/atextcrawler/site/feeds.py

"""
High-level feed-related stuff.
See resource.feed for low-level stuff not primarily related to sites.
"""
from datetime import datetime
from typing import Optional

from ..models import Feed
from ..resource import store_feed_entries, update_feed


async def store_new_feeds(conn, site_id, feeds: dict):
    """
    Store new feeds in table site_feed.
    """
sql = "SELECT array_agg(url) FROM site_feed WHERE site_id=$1"
known_feeds = (await conn.fetchval(sql, site_id)) or []
for feed_url in feeds.keys():
if feed_url not in known_feeds:
feed = Feed(
site_id=site_id,
url=feed_url,
)
await feed.save(conn)


async def get_feeds(conn, site_id) -> list[Feed]:
    """
    Return stored feeds for the given site.
    """
    sql = "SELECT * FROM site_feed WHERE site_id=$1"
    rows = (await conn.fetch(sql, site_id)) or []
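    # Rehydrate Feed model instances from the raw database rows.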
    return [(await Feed().load_from_row(row)) for row in rows]


async def fetch_feeds(fetcher, conn, site) -> Optional[datetime]:
    """
    Fetch feeds, add new resources and return the latest content update time.
    """
    feeds = await get_feeds(conn, site.id_)
    latest = None
    for feed in feeds:
        feed_content = await update_feed(fetcher, feed, conn)
        if feed_content:
            await store_feed_entries(conn, site, feed_content)
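        # feed.t_content is the feed's latest content timestamp,
        # presumably refreshed by update_feed.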
        if feed.t_content:
            latest = max(latest or feed.t_content, feed.t_content)
    return latest
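

# The functions above are building blocks: store_new_feeds() persists newly
# discovered feed URLs, get_feeds() loads them back, and fetch_feeds() pulls
# in their entries. The block below wires them together for manual testing.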

if __name__ == '__main__':
    # only use this on a dev instance!
    import asyncio
    import logging
    import sys

    import aiohttp

    from ..config import Config
    from ..db import PGPool
    from ..resource.fetch import ResourceFetcher
    from .operations import process_site, update_site

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    config = Config().get()
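    # The site's start URL is expected as the only command-line argument.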
    url = sys.argv[1]

    async def run():
        """
        Fetch and process a site, then fetch its feeds.
        """
        app = None  # TODO
        async with PGPool(config['postgresql']) as pool:
            async with pool.acquire() as conn:
                async with aiohttp.ClientSession() as session:
                    fetcher = ResourceFetcher(session)
                    site, _ = await update_site(app, fetcher, conn, url)
                    logger.warning(site)
                    await process_site(fetcher, conn, site)
                    latest = await fetch_feeds(fetcher, conn, site)
                    logger.warning(f'latest: {latest}')
                    # feed = Feed(url=url)
                    # feed_content = await update_feed(fetcher, feed, conn)
                    # if isinstance(feed_content, ResourceError):
                    #     print(feed_content)
                    # else:
                    #     print(feed)
                    #     pprint(feed_content[0])
                    # print('---- 2nd try ----')
                    # feed_content = await update_feed(fetcher, feed, conn)
                    # if isinstance(feed_content, ResourceError):
                    #     print(feed_content)
                    # else:
                    #     print(feed)
                    #     pprint(feed_content[0])

    asyncio.run(run())