""" Access to a resource specified by a URL. """ import gzip import logging from json import loads from traceback import format_exc from typing import Any, Optional, Union import aiohttp from bs4 import BeautifulSoup from ..models import ( Feed, MetaResource, ResourceError, ResourceRedirect, Site, TextResource, ) from ..utils.durl import Durl from ..utils.link import in_blacklist from .document import parse_document from .feed import parse_json_feed, parse_xml_feed from .page import parse_html from .plaintext import parse_plaintext from .sitemap import parse_sitemap, parse_sitemapindex logger = logging.getLogger(__name__) MAX_REDIRECTS = 10 """ Maximum number of redirects to follow. """ default_headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux aarch64; rv:78.0)' ' Gecko/20100101 Firefox/78.0', 'DNT': '1', 'Upgrade-Insecure-Requests': '1', 'Accept-Language': 'en-US,en;q=0.5, *;q=0.5', } """ Default HTTP client headers, overwriting those of aiohttp.ClientSession. """ blacklist_content_types = [ '', 'application/ogg', ] """ Blacklist for content-types. """ text_content_types = { 'text/html': 'html', 'text/plain': 'plain', 'application/rss+xml': 'feed-rss', 'application/atom+xml': 'feed-atom', 'application/feed+json': 'feed-json', 'application/json': 'json', 'application/xml': 'xml', 'text/xml': 'xml', } """ Map content-types to parsers. """ class ResourceFetcher: """ Fetch a resource specified by a URL (:meth:`fetch`). The timeout is the same for all requests. """ def __init__( self, session: aiohttp.ClientSession, timeout_sock_connect: Union[int, float] = 8, timeout_sock_read: Union[int, float] = 30, ): self.session = session self.timeout = aiohttp.ClientTimeout( sock_connect=timeout_sock_connect, sock_read=timeout_sock_read ) async def fetch( self, url: str, site: Optional[Site] = None, redirect_history: Optional[list[str]] = None, headers: Optional[dict] = None, ) -> Union[ None, MetaResource, TextResource, ResourceError, ResourceRedirect ]: """ Try to fetch a resource and return an instance or error or redirect. If an error was encountered, return a ResourceError. If the resource has an irrelevant content type, return None. Otherwise return a specific content instance. Argument *redirect_history* contains the redirect history; if one of the redirects is encountered again, return None. """ if redirect_history is None: redirect_history = [] if not (durl := await Durl(url)): return ResourceError('Invalid URL') resp = await self.get_resp( durl, redirect_history=redirect_history, headers=headers, ) if isinstance(resp, ResourceError): return resp if resp is None: return None result = await self._parse(durl, site, resp) if isinstance(result, (MetaResource, TextResource)): result.id_ = None return result async def _parse( self, durl, site, resp, in_recursion=False ) -> Union[ None, MetaResource, TextResource, ResourceError, ResourceRedirect ]: """ Parse a response. May call itself. """ result: Union[ None, MetaResource, TextResource, ResourceError, ResourceRedirect ] = None content = resp['content'] if isinstance(content, str) and content.startswith(' Optional[Union[ResourceError, dict]]: """ Try to fetch a url returning a ResourceError or a dict with content. Optional *headers* will overwrite the :var:`default_headers`. If the response status is not 200, always return an ResourceError. If the content-type is not relevant (see blacklist_content_types), return None. 

        The dict contains these keys+values:

        * 'parser': a hint on the parser to use for analyzing the content;
          one of the values in text_content_types ('html', 'plain',
          'feed-rss', 'feed-atom', 'feed-json', 'json', 'xml')
          or 'application'
        * 'content': bytes for type application, otherwise str
        * 'redirects': a list of URLs visited during HTTP redirection,
          the last item is the final URL
        * 'headers': response headers
        """
        if redirect_history is None:
            redirect_history = []
        if len(redirect_history) >= MAX_REDIRECTS:
            return None
        headers_ = default_headers.copy()
        if headers:
            headers_.update(headers)
        try:
            async with self.session.get(
                durl.url(),
                headers=headers_,
                timeout=self.timeout,
            ) as resp:
                redirects = [durl.url()]
                if resp.history:
                    href = resp.history[-1].headers.get('location')
                    if not href or not (redurl := await Durl(href, base=durl)):
                        msg = 'Invalid URL after HTTP redirect'
                        return ResourceError(msg)
                    if in_blacklist(redurl.hostname):
                        src_url = (
                            redirect_history[0]
                            if redirect_history
                            else durl.url()
                        )
                        msg = (
                            f'Dropping URL {src_url}, since'
                            f' redirected to a blacklisted site'
                        )
                        logger.debug(msg)
                        return None
                    redirects = [str(r.url) for r in resp.history]
                    redirects.append(redurl.url())
                if join := set(redirect_history) & set(redirects):
                    msg = f'Cyclic redirect {join}'
                    return ResourceError(msg)
                if resp.status != 200:
                    msg = f'HTTP status {resp.status}'
                    return ResourceError(
                        msg, status=resp.status, headers=headers
                    )
                c_type = resp.headers.get('content-type', '').split(';')[0]
                if c_type in blacklist_content_types:
                    return None
                result: dict[str, Any] = {
                    'redirects': redirect_history + redirects,
                    'headers': resp.headers,
                }
                if c_type in text_content_types.keys():
                    try:  # catch decoding issues
                        content = await resp.text()
                    except Exception:
                        body = await resp.read()
                        encoding = resp.charset or 'utf-8'
                        encoding = encoding.replace('CP-1250', 'cp1250')
                        content = body.decode(encoding, errors='replace')
                    result['content'] = content
                    result['parser'] = text_content_types[c_type]
                    return result
                elif c_type.startswith('application/'):
                    result['content'] = await resp.read()
                    result['parser'] = 'application'
                    return result
        except aiohttp.ClientError as error:
            # on certificate error, try again without TLS
            if 'SSLCertVerificationError' in str(error):
                if durl.scheme == 'https':
                    url = durl.url()
                    durl.replace_scheme('http')
                    response = await self.get_resp(
                        durl=durl,
                        headers=headers,
                        redirect_history=redirect_history + [url],
                    )
                    if not isinstance(response, ResourceError):
                        return response
            msg = f'ClientError: {error}'
            return ResourceError(msg)
        except Exception as error:
            msg = f'Unknown error: {error}:\n{format_exc()}'
            logger.error(msg)
            return ResourceError(msg)
        return None


async def parse_xml(
    durl: Durl,
    response: dict,
    rss=False,
    atom=False,
) -> Optional[Union[MetaResource, ResourceError]]:
    """
    Parse XML content.

    In particular, parse sitemapindex, sitemap, RSS feed, atom feed.
    """
    try:
        xml = response['content']
        soup = BeautifulSoup(xml, 'html.parser')
    except Exception:
        return None
    if rss or (rss := soup.find('rss')):
        return parse_xml_feed(response)
    elif atom or (atom := soup.find('atom')):
        return parse_xml_feed(response)
    elif sitemapindex := soup.find('sitemapindex'):
        return parse_sitemapindex(sitemapindex)
    elif urlset := soup.find('urlset'):
        return parse_sitemap(urlset)
    else:
        return None
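

# Illustration (a minimal sketch): parse_xml() above picks a parser by
# inspecting only the root element of the document. The URL and XML snippet
# below are made up; the 'redirects' and 'headers' keys mirror the dict
# produced by ResourceFetcher.get_resp().
#
#     resp = {
#         'content': '<urlset><url><loc>https://example.com/a</loc></url></urlset>',
#         'redirects': ['https://example.com/sitemap.xml'],
#         'headers': {},
#     }
#     durl = await Durl('https://example.com/sitemap.xml')
#     sitemap = await parse_xml(durl, resp)  # root <urlset>: parse_sitemap()
#
# Likewise, a root <rss> element is handed to parse_xml_feed() and a root
# <sitemapindex> element to parse_sitemapindex().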
""" try: data = loads(response['content']) except: msg = f'Could not parse JSON from {durl.url()}' logger.debug(msg) return None if not isinstance(data, dict): return None if data.get('version', '').startswith('https://jsonfeed.org/'): return parse_json_feed(response, data) return None