"""
Low-level handling of web feeds: fetching, parsing, entry conversion.

Higher-level feed logic is in site.feeds.
"""
|
|
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
from typing import Optional, Union
|
|
|
|
from asyncpg import Connection
|
|
from feedparser import parse
|
|
|
|
from ..models import Feed, MetaResource, ResourceError
|
|
from ..utils.durl import Durl
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# MIME types that identify a resource as a web feed (RSS, Atom, JSON Feed).
# Not referenced elsewhere in this chunk — presumably used by feed-link
# discovery in another module; confirm against callers.
feed_types = (
    'application/rss+xml',
    'application/atom+xml',
    'application/feed+json',
)
|
|
|
|
|
|
async def update_feed(fetcher, feed: Feed, conn: Connection) -> Optional[list[dict]]:
    """
    Fetch, parse and return a given feed's content. Also update *feed*.

    If the server replied with HTTP 410, delete the feed and return None.
    If there is no new information (server replied with HTTP 304),
    return None. For other errors also return None and increase the
    fail_count; after more than 5 consecutive failures the feed is
    deleted.
    """
    # Conditional request: prefer Last-Modified over ETag validation.
    headers = {'Cache-control': 'max-age=600'}
    if feed.modified:
        headers['If-Modified-Since'] = feed.modified
    elif feed.etag:
        # Strip the weak-validator marker; presumably some servers choke
        # on it — TODO confirm why the prefix is removed here.
        headers['If-None-Match'] = feed.etag.removeprefix('W/')
    resource = await fetcher.fetch(feed.url, headers=headers)
    if isinstance(resource, ResourceError):
        if resource.status == 410:
            msg = f'Feed has vanished, deleting it: {feed}'
            logger.debug(msg)
            await feed.delete(conn)
            # Fix: return immediately — previously execution fell through,
            # incremented fail_count on the deleted feed and could call
            # delete() a second time.
            return None
        if resource.status != 304:
            feed.fail_count += 1
            if feed.fail_count > 5:
                msg = f'Feed not reachable, deleting it: {feed}'
                logger.debug(msg)
                await feed.delete(conn)
        return None  # HTTP 304 (no new entries) or other fetch error
    elif isinstance(resource, Feed):
        # The fetcher parsed a fresh Feed; carry over the stored identity
        # before persisting so the existing row is updated.
        resource.id_ = feed.id_
        resource.site_id = feed.site_id
        await resource.save(conn)
        return resource.entries
    else:
        return None
|
|
|
|
|
|
def parse_json_feed(resp, data: dict) -> Feed:
    """
    Parse a JSON response for jsonfeed information.

    TODO: handle 'next_url' (see https://jsonfeed.org/version/1.1)
    """
    feed = Feed()
    # Prefer the feed's self-declared URL; fall back to the final URL
    # after redirects.
    feed.url = data.get('feed_url', resp['redirects'][-1])
    feed.etag = resp['headers'].get('ETag')
    feed.modified = resp['headers'].get('Last-Modified')
    feed.t_visit = datetime.utcnow()
    version = data.get('version', '')
    version = 'json-' + version.removeprefix('https://jsonfeed.org/version/')
    feed.version = version[:10]  # presumably a column length limit — verify
    feed.title = data.get('title')
    feed.description = data.get('description')
    feed.fail_count = 0
    entries = []
    latest = None
    # parse feed entries to a dict compatible with feedparser's entries
    for feed_item in data.get('items', []):
        entry = {}
        entry['link'] = feed_item.get('url')
        dt = feed_item.get('date_published')
        if dt:
            # Fix: removed redundant `if dt else None` (dt is known truthy
            # here) and stopped a single malformed date string from
            # aborting the whole feed parse.
            try:
                dt = datetime.fromisoformat(dt)
            except ValueError:
                dt = None
            else:
                # NOTE(review): this converts to local wall-clock time and
                # then labels it UTC, which shifts the instant unless the
                # host runs in UTC — looks deliberate for naive-UTC
                # storage, but confirm.
                dt = dt.astimezone(tz=None).replace(tzinfo=timezone.utc)
                entry['published_parsed'] = dt.timetuple()
        entry['title'] = feed_item.get('title')
        entry['summary'] = feed_item.get('summary')
        entries.append(entry)
        if dt:
            latest = max(latest or dt, dt)
    feed.entries = entries
    feed.t_content = latest
    return feed
|
|
|
|
|
|
def parse_xml_feed(resp) -> Union[Feed, ResourceError]:
    """
    Parse a response from Fetcher.get_resp() for xml feed information.

    On a feedparser failure a ResourceError is returned instead of a Feed.
    """
    headers = resp['headers']
    feed = Feed()
    feed.url = resp['redirects'][-1]
    feed.etag = headers.get('ETag')
    feed.modified = headers.get('Last-Modified')
    feed.t_visit = datetime.utcnow()
    try:
        parsed = parse(resp['content'], response_headers=headers)
    except Exception as error:
        return ResourceError(f'Feedparser error: {error}')
    meta = parsed['feed']
    updated = meta.get('updated_parsed')
    if updated:
        updated_at = datetime(*updated[:6])
        # Keep the newest known content timestamp.
        feed.t_content = max(feed.t_content or updated_at, updated_at)
    feed.version = parsed['version']
    feed.title = meta.get('title', '')[:200] or None
    feed.description = meta.get('description')
    feed.fail_count = 0
    feed.entries = parsed['entries']
    return feed
|
|
|
|
|
|
def convert_feed_entries(
        base_url: Optional[str],
        entries: list[dict],
) -> tuple[
        list[tuple[str, bool]],
        dict[str, tuple[Optional[str], Optional[str], Optional[str]]],
]:
    """
    Extract paths and resource meta information from a feed's entries.

    Return paths in a structure wanted by :func:`add_site_paths` and
    resource meta information in a structure wanted by
    :func:`update_resource_meta`.

    Entries without a link, with a link outside *base_url*, or with a
    path longer than 200 characters are skipped.
    """
    paths = []
    resource_meta = {}
    for entry in entries:
        link = entry.get('link')
        # With base_url=None every link matches (startswith('') is True).
        if link and link.startswith(base_url or ''):
            path = link.removeprefix(base_url or '').lstrip('/')
            if len(path) <= 200:
                last_update = entry.get('published_parsed')
                if last_update:
                    # published_parsed is a struct_time-like 9-tuple.
                    last_update = datetime(*last_update[:6])
                paths.append((path, True))
                # Fix: 'title'/'summary' may be present with value None
                # (parse_json_feed stores feed_item.get(...) verbatim);
                # dict.get's default does not apply then, so the old
                # `entry.get('title', '')[:200]` raised TypeError.
                resource_meta[path] = (
                    last_update,
                    (entry.get('title') or '')[:200] or None,
                    (entry.get('summary') or '')[:2000] or None,
                )
    return paths, resource_meta
|