atextcrawler/src/atextcrawler/resource/feed.py

"""
Stuff related to feeds.
Higher-level stuff is in site.feeds.
"""
import logging
from datetime import datetime, timezone
from typing import Optional, Union

from asyncpg import Connection
from feedparser import parse

from ..models import Feed, MetaResource, ResourceError
from ..utils.durl import Durl

logger = logging.getLogger(__name__)
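
# MIME types identifying a resource as a feed.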
feed_types = (
    'application/rss+xml',
    'application/atom+xml',
    'application/feed+json',
)


async def update_feed(fetcher, feed, conn) -> Optional[list[dict]]:
    """
    Fetch, parse and return a given feed's entries. Also update *feed*.

    If the server replied with HTTP 410, delete the feed.
    If there is no new information (server replied with HTTP 304),
    return None. For other errors also return None and increase the
    feed's fail_count.
    """
    headers = {'Cache-control': 'max-age=600'}
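    # Send a conditional request: prefer the stored Last-Modified value,
    # otherwise fall back to the ETag (stripping a weak-validator "W/" prefix).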
    if feed.modified:
        headers['If-Modified-Since'] = feed.modified
    elif feed.etag:
        headers['If-None-Match'] = feed.etag.removeprefix('W/')
    resource = await fetcher.fetch(feed.url, headers=headers)
    if isinstance(resource, ResourceError):
        if resource.status == 410:
            msg = f'Feed has vanished, deleting it: {feed}'
            logger.debug(msg)
            await feed.delete(conn)
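        # Any error other than "not modified" counts as a failure;
        # after more than 5 consecutive failures the feed is deleted.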
        if resource.status != 304:
            feed.fail_count += 1
            if feed.fail_count > 5:
                msg = f'Feed not reachable, deleting it: {feed}'
                logger.debug(msg)
                await feed.delete(conn)
        return None  # HTTP 304 or a failed fetch: no new entries
    elif isinstance(resource, Feed):
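        # Carry over the database identity of the known feed to the
        # freshly parsed one before saving it.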
        resource.id_ = feed.id_
        resource.site_id = feed.site_id
        await resource.save(conn)
        return resource.entries
    else:
        return None


def parse_json_feed(resp, data: dict) -> Feed:
    """
    Parse a JSON response for jsonfeed information.

    TODO: handle 'next_url' (see https://jsonfeed.org/version/1.1)
    """
    feed = Feed()
    feed.url = data.get('feed_url', resp['redirects'][-1])
    feed.etag = resp['headers'].get('ETag')
    feed.modified = resp['headers'].get('Last-Modified')
    feed.t_visit = datetime.utcnow()
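    # Normalize the version string, e.g.
    # 'https://jsonfeed.org/version/1.1' becomes 'json-1.1'.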
    version = data.get('version', '')
    version = 'json-' + version.removeprefix('https://jsonfeed.org/version/')
    feed.version = version[:10]
    feed.title = data.get('title')
    feed.description = data.get('description')
    feed.fail_count = 0
    entries = []
    latest = None
    # Parse the feed items into dicts compatible with feedparser's entries.
    for feed_item in data.get('items', []):
        entry = {}
        entry['link'] = feed_item.get('url')
        dt = feed_item.get('date_published')
        if dt:
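            # Convert the publication date to UTC and store it in the
            # struct_time format used by feedparser's entries.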
            dt = datetime.fromisoformat(dt)
            dt = dt.astimezone(tz=timezone.utc)
            entry['published_parsed'] = dt.timetuple()
        entry['title'] = feed_item.get('title')
        entry['summary'] = feed_item.get('summary')
        entries.append(entry)
        if dt:
            latest = max(latest or dt, dt)
    feed.entries = entries
    feed.t_content = latest
    return feed


def parse_xml_feed(resp) -> Union[Feed, ResourceError]:
    """
    Parse a response from Fetcher.get_resp() for xml feed information.
    """
    feed = Feed()
    feed.url = resp['redirects'][-1]
    feed.etag = resp['headers'].get('ETag')
    feed.modified = resp['headers'].get('Last-Modified')
    feed.t_visit = datetime.utcnow()
    try:
        parsed = parse(resp['content'], response_headers=resp['headers'])
    except Exception as error:
        return ResourceError(f'Feedparser error: {error}')
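    # Use the feed-level 'updated' timestamp, if present, as the time of
    # the latest content change.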
    latest = parsed['feed'].get('updated_parsed')
    if latest:
        latest = datetime(*latest[:6])
        feed.t_content = max(feed.t_content or latest, latest)
    feed.version = parsed['version']
    feed.title = parsed['feed'].get('title', '')[:200] or None
    feed.description = parsed['feed'].get('description')
    feed.fail_count = 0
    feed.entries = parsed['entries']
    return feed


def convert_feed_entries(
    base_url: Optional[str],
    entries: list[dict],
) -> tuple[
    list[tuple[str, bool]],
    dict[str, tuple[Optional[datetime], Optional[str], Optional[str]]],
]:
    """
    Extract paths and resource meta information from a feed's entries.

    Return paths in a structure wanted by :func:`add_site_paths` and
    resource meta information in a structure wanted by
    :func:`update_resource_meta`.
    """
    paths = []
    resource_meta = {}
    for entry in entries:
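        # Keep only links below the site's base URL and convert them
        # to site-relative paths.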
        if entry.get('link') and entry['link'].startswith(base_url or ''):
            path = entry['link'].removeprefix(base_url or '').lstrip('/')
            if len(path) <= 200:
                last_update = entry.get('published_parsed')
                if last_update:
                    last_update = datetime(*last_update[:6])
                paths.append((path, True))
                resource_meta[path] = (
                    last_update,
                    entry.get('title', '')[:200] or None,
                    entry.get('summary', '')[:2000] or None,
                )
    return paths, resource_meta