328 lines
11 KiB
Python
328 lines
11 KiB
Python
"""
|
|
Access to a resource specified by a URL.
|
|
"""
|
|
|
|
import gzip
|
|
import logging
|
|
from json import loads
|
|
from traceback import format_exc
|
|
from typing import Any, Optional, Union
|
|
|
|
import aiohttp
|
|
from bs4 import BeautifulSoup
|
|
|
|
from ..models import (
|
|
Feed,
|
|
MetaResource,
|
|
ResourceError,
|
|
ResourceRedirect,
|
|
Site,
|
|
TextResource,
|
|
)
|
|
from ..utils.durl import Durl
|
|
from ..utils.link import in_blacklist
|
|
from .document import parse_document
|
|
from .feed import parse_json_feed, parse_xml_feed
|
|
from .page import parse_html
|
|
from .plaintext import parse_plaintext
|
|
from .sitemap import parse_sitemap, parse_sitemapindex
|
|
|
|
logger = logging.getLogger(__name__)


MAX_REDIRECTS = 10
"""
Maximum number of redirects to follow before giving up on a URL.
"""


default_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux aarch64; rv:78.0)'
    ' Gecko/20100101 Firefox/78.0',
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1',
    'Accept-Language': 'en-US,en;q=0.5, *;q=0.5',
}
"""
Default HTTP client headers, overwriting those of aiohttp.ClientSession.

Callers of :meth:`ResourceFetcher.fetch` may override individual entries
via the *headers* argument.
"""


blacklist_content_types = [
    '',
    'application/ogg',
]
"""
Blacklist for content-types; responses with these types are ignored
(``get_resp`` returns None for them).
"""


text_content_types = {
    'text/html': 'html',
    'text/plain': 'plain',
    'application/rss+xml': 'feed-rss',
    'application/atom+xml': 'feed-atom',
    'application/feed+json': 'feed-json',
    'application/json': 'json',
    'application/xml': 'xml',
    'text/xml': 'xml',
}
"""
Map content-types to parser names (the 'parser' hint in get_resp's result).
"""
|
|
|
|
|
|
class ResourceFetcher:
    """
    Fetch a resource specified by a URL (:meth:`fetch`).

    The timeout is the same for all requests made through the given
    aiohttp *session*.
    """

    def __init__(
        self,
        session: aiohttp.ClientSession,
        timeout_sock_connect: Union[int, float] = 8,
        timeout_sock_read: Union[int, float] = 30,
    ):
        self.session = session
        # One shared timeout object; aiohttp applies it per-request.
        self.timeout = aiohttp.ClientTimeout(
            sock_connect=timeout_sock_connect, sock_read=timeout_sock_read
        )

    async def fetch(
        self,
        url: str,
        site: Optional[Site] = None,
        redirect_history: Optional[list[str]] = None,
        headers: Optional[dict] = None,
    ) -> Union[
        None, MetaResource, TextResource, ResourceError, ResourceRedirect
    ]:
        """
        Try to fetch a resource and return an instance or error or redirect.

        If an error was encountered, return a ResourceError.
        If the resource has an irrelevant content type, return None.
        Otherwise return a specific content instance.

        Argument *redirect_history* contains the redirect history;
        if one of the redirects is encountered again, return None.
        """
        if redirect_history is None:
            redirect_history = []
        if not (durl := await Durl(url)):
            return ResourceError('Invalid URL')
        resp = await self.get_resp(
            durl,
            redirect_history=redirect_history,
            headers=headers,
        )
        if isinstance(resp, ResourceError):
            return resp
        if resp is None:
            return None
        result = await self._parse(durl, site, resp)
        if isinstance(result, (MetaResource, TextResource)):
            # Freshly fetched resources have no database id yet.
            result.id_ = None
        return result

    async def _parse(
        self, durl, site, resp, in_recursion=False
    ) -> Union[
        None, MetaResource, TextResource, ResourceError, ResourceRedirect
    ]:
        """
        Parse a response dict (as returned by :meth:`get_resp`).

        Dispatch on the 'parser' hint (and on an XML declaration in the
        content). May call itself once to handle gzipped payloads;
        *in_recursion* guards against nested gzip.
        """
        result: Union[
            None, MetaResource, TextResource, ResourceError, ResourceRedirect
        ] = None
        content = resp['content']
        # An XML declaration wins over the content-type-derived hint.
        if isinstance(content, str) and content.startswith('<?xml '):
            result = await parse_xml(durl, resp)
        elif resp['parser'] == 'feed-rss':
            result = await parse_xml(durl, resp, rss=True)
        elif resp['parser'] == 'feed-atom':
            result = await parse_xml(durl, resp, atom=True)
        elif resp['parser'] == 'xml':
            result = await parse_xml(durl, resp)
        elif resp['parser'] == 'html':
            result = await parse_html(durl, resp, site)
        elif resp['parser'] in ('json', 'feed-json'):
            result = await parse_json(durl, resp)
        elif resp['parser'] == 'plain':
            result = await parse_plaintext(durl, resp, site)
        elif resp['parser'] == 'application':
            if resp['headers'].get('content-type') == 'application/x-gzip':
                if in_recursion:
                    return None  # consider nested gzip an attack
                resp['content'] = gzip.decompress(resp['content'])
                return await self._parse(durl, site, resp, in_recursion=True)
            result = await parse_document(durl, resp, site)
        if isinstance(result, ResourceRedirect):
            # Follow a redirect discovered at parse time (e.g. meta refresh).
            redir_url = result.urls[-1]
            result = await self.fetch(
                redir_url,
                site=site,
                redirect_history=result.urls[:-1],
            )
        return result

    async def get_resp(
        self,
        durl: Durl,
        headers: Optional[dict] = None,
        redirect_history: Optional[list[str]] = None,
    ) -> Optional[Union[ResourceError, dict]]:
        """
        Try to fetch a url returning a ResourceError or a dict with content.

        Optional *headers* will overwrite the :var:`default_headers`.

        If the response status is not 200, always return an ResourceError.

        If the content-type is not relevant (see blacklist_content_types),
        return None.

        The dict contains these keys+values:

        * 'parser': a hint on the parser to use for analyzing the content;
          one of 'html', 'plain', 'feed', 'xml', 'application'
        * 'content': bytes for type application, otherwise str
        * 'redirects': a list of URLs visited during HTTP redirection,
          the last item is the final URL
        * 'headers': response headers
        """
        if redirect_history is None:
            redirect_history = []
        if len(redirect_history) >= MAX_REDIRECTS:
            return None
        headers_ = default_headers.copy()
        if headers:
            headers_.update(headers)
        try:
            async with self.session.get(
                durl.url(),
                headers=headers_,
                timeout=self.timeout,
            ) as resp:
                redirects = [durl.url()]
                if resp.history:
                    # NOTE(review): the location header is resolved against
                    # the original *durl*, not the last redirect hop —
                    # confirm this is intended for multi-hop redirects.
                    href = resp.history[-1].headers.get('location')
                    if not href or not (redurl := await Durl(href, base=durl)):
                        msg = 'Invalid URL after HTTP redirect'
                        return ResourceError(msg)
                    if in_blacklist(redurl.hostname):
                        src_url = (
                            redirect_history[0]
                            if redirect_history
                            else durl.url()
                        )
                        msg = (
                            f'Dropping URL {src_url}, since'
                            f' redirected to a blacklisted site'
                        )
                        logger.debug(msg)
                        return None
                    redirects = [str(r.url) for r in resp.history]
                    redirects.append(redurl.url())
                    if join := set(redirect_history) & set(redirects):
                        msg = f'Cyclic redirect {join}'
                        return ResourceError(msg)
                if resp.status != 200:
                    msg = f'HTTP status {resp.status}'
                    return ResourceError(
                        msg, status=resp.status, headers=headers
                    )
                c_type = resp.headers.get('content-type', '').split(';')[0]
                if c_type in blacklist_content_types:
                    return None
                result: dict[str, Any] = {
                    'redirects': redirect_history + redirects,
                    'headers': resp.headers,
                }
                if c_type in text_content_types.keys():
                    try:  # catch decoding issues
                        content = await resp.text()
                    except Exception:
                        # Narrowed from a bare ``except:`` so that
                        # asyncio.CancelledError (a BaseException) still
                        # propagates and task cancellation keeps working.
                        body = await resp.read()
                        encoding = resp.charset or 'utf-8'
                        encoding = encoding.replace('CP-1250', 'cp1250')
                        content = body.decode(encoding, errors='replace')
                    result['content'] = content
                    result['parser'] = text_content_types[c_type]
                    return result
                elif c_type.startswith('application/'):
                    result['content'] = await resp.read()
                    result['parser'] = 'application'
                    return result
        except aiohttp.ClientError as error:
            # on certificate error try without tls
            if 'SSLCertVerificationError' in str(error):
                if durl.scheme == 'https':
                    url = durl.url()
                    durl.replace_scheme('http')
                    response = await self.get_resp(
                        durl=durl,
                        headers=headers,
                        redirect_history=redirect_history + [url],
                    )
                    if not isinstance(response, ResourceError):
                        return response
            msg = f'ClientError: {error}'
            return ResourceError(msg)
        except Exception as error:
            msg = f'Unknown error: {error}:\n{format_exc()}'
            logger.error(msg)
            return ResourceError(msg)
        return None
|
|
|
|
|
|
async def parse_xml(
    durl: Durl,
    response: dict,
    rss=False,
    atom=False,
) -> Optional[Union[MetaResource, ResourceError]]:
    """
    Parse XML content.

    In particular, parse sitemapindex, sitemap, RSS feed, atom feed.

    *rss* / *atom* force feed parsing (set when the HTTP content-type
    already identified the feed flavor); otherwise the document's root
    element decides. Return None for unparseable or unrecognized XML.
    """
    try:
        xml = response['content']
        soup = BeautifulSoup(xml, 'html.parser')
    except Exception:
        # Narrowed from a bare ``except:`` so asyncio.CancelledError
        # still propagates and task cancellation keeps working.
        return None
    if rss or (rss := soup.find('rss')):
        return parse_xml_feed(response)
    elif atom or (atom := soup.find('feed')):
        # Atom documents have root element <feed> (RFC 4287); the previous
        # lookup for an <atom> tag could never match, so Atom feeds served
        # with a generic XML content-type were silently dropped.
        return parse_xml_feed(response)
    elif sitemapindex := soup.find('sitemapindex'):
        return parse_sitemapindex(sitemapindex)
    elif urlset := soup.find('urlset'):
        return parse_sitemap(urlset)
    else:
        return None
|
|
|
|
|
|
async def parse_json(
    durl: Durl,
    response: dict,
) -> Optional[Union[Feed, ResourceError]]:
    """
    Parse the content of JSON feeds.

    Return a parsed feed only for JSON Feed documents (identified by
    their 'version' URL); return None for anything else, including
    content that is not valid JSON.
    """
    try:
        data = loads(response['content'])
    except (TypeError, ValueError):
        # json.loads raises ValueError (JSONDecodeError) for malformed
        # JSON and TypeError for non-str/bytes input; the previous bare
        # ``except:`` also swallowed asyncio.CancelledError.
        msg = f'Could not parse JSON from {durl.url()}'
        logger.debug(msg)
        return None
    if not isinstance(data, dict):
        return None
    version = data.get('version', '')
    # Guard against a non-string 'version' value, which previously
    # raised AttributeError on .startswith().
    if isinstance(version, str) and version.startswith(
        'https://jsonfeed.org/'
    ):
        return parse_json_feed(response, data)
    return None
|