atextcrawler/src/atextcrawler/resource/fetch.py

"""
Access to a resource specified by a URL.
"""
import gzip
import logging
from json import loads
from traceback import format_exc
from typing import Any, Optional, Union

import aiohttp
from bs4 import BeautifulSoup

from ..models import (
    Feed,
    MetaResource,
    ResourceError,
    ResourceRedirect,
    Site,
    TextResource,
)
from ..utils.durl import Durl
from ..utils.link import in_blacklist
from .document import parse_document
from .feed import parse_json_feed, parse_xml_feed
from .page import parse_html
from .plaintext import parse_plaintext
from .sitemap import parse_sitemap, parse_sitemapindex

logger = logging.getLogger(__name__)

MAX_REDIRECTS = 10
"""
Maximum number of redirects to follow.
"""

default_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux aarch64; rv:78.0)'
    ' Gecko/20100101 Firefox/78.0',
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1',
    'Accept-Language': 'en-US,en;q=0.5, *;q=0.5',
}
"""
Default HTTP client headers, overwriting those of aiohttp.ClientSession.
"""

blacklist_content_types = [
    '',
    'application/ogg',
]
"""
Blacklist for content-types.
"""

text_content_types = {
    'text/html': 'html',
    'text/plain': 'plain',
    'application/rss+xml': 'feed-rss',
    'application/atom+xml': 'feed-atom',
    'application/feed+json': 'feed-json',
    'application/json': 'json',
    'application/xml': 'xml',
    'text/xml': 'xml',
}
"""
Map content-types to parsers.
"""


class ResourceFetcher:
    """
    Fetch a resource specified by a URL (:meth:`fetch`).

    The timeout is the same for all requests.
    """

    def __init__(
        self,
        session: aiohttp.ClientSession,
        timeout_sock_connect: Union[int, float] = 8,
        timeout_sock_read: Union[int, float] = 30,
    ):
        self.session = session
        self.timeout = aiohttp.ClientTimeout(
            sock_connect=timeout_sock_connect, sock_read=timeout_sock_read
        )

    async def fetch(
        self,
        url: str,
        site: Optional[Site] = None,
        redirect_history: Optional[list[str]] = None,
        headers: Optional[dict] = None,
    ) -> Union[
        None, MetaResource, TextResource, ResourceError, ResourceRedirect
    ]:
        """
        Try to fetch a resource and return an instance or error or redirect.

        If an error was encountered, return a ResourceError.
        If the resource has an irrelevant content type, return None.
        Otherwise return a specific content instance.

        Argument *redirect_history* contains the redirect history;
        if one of the redirects is encountered again, a cyclic-redirect
        ResourceError is returned.
        """
        if redirect_history is None:
            redirect_history = []
        if not (durl := await Durl(url)):
            return ResourceError('Invalid URL')
        resp = await self.get_resp(
            durl,
            redirect_history=redirect_history,
            headers=headers,
        )
        if isinstance(resp, ResourceError):
            return resp
        if resp is None:
            return None
        result = await self._parse(durl, site, resp)
        if isinstance(result, (MetaResource, TextResource)):
            result.id_ = None
        return result

    async def _parse(
        self, durl, site, resp, in_recursion=False
    ) -> Union[
        None, MetaResource, TextResource, ResourceError, ResourceRedirect
    ]:
        """
        Parse a response. May call itself.
        """
        result: Union[
            None, MetaResource, TextResource, ResourceError, ResourceRedirect
        ] = None
        content = resp['content']
        if isinstance(content, str) and content.startswith('<?xml '):
            result = await parse_xml(durl, resp)
        elif resp['parser'] == 'feed-rss':
            result = await parse_xml(durl, resp, rss=True)
        elif resp['parser'] == 'feed-atom':
            result = await parse_xml(durl, resp, atom=True)
        elif resp['parser'] == 'xml':
            result = await parse_xml(durl, resp)
        elif resp['parser'] == 'html':
            result = await parse_html(durl, resp, site)
        elif resp['parser'] in ('json', 'feed-json'):
            result = await parse_json(durl, resp)
        elif resp['parser'] == 'plain':
            result = await parse_plaintext(durl, resp, site)
        elif resp['parser'] == 'application':
            if resp['headers'].get('content-type') == 'application/x-gzip':
                if in_recursion:
                    return None  # consider nested gzip an attack
                resp['content'] = gzip.decompress(resp['content'])
                return await self._parse(durl, site, resp, in_recursion=True)
            result = await parse_document(durl, resp, site)
        if isinstance(result, ResourceRedirect):
            redir_url = result.urls[-1]
            result = await self.fetch(
                redir_url,
                site=site,
                redirect_history=result.urls[:-1],
            )
        return result

    async def get_resp(
        self,
        durl: Durl,
        headers: Optional[dict] = None,
        redirect_history: Optional[list[str]] = None,
    ) -> Optional[Union[ResourceError, dict]]:
        """
        Try to fetch a url returning a ResourceError or a dict with content.

        Optional *headers* will overwrite the :var:`default_headers`.

        If the response status is not 200, always return a ResourceError.
        If the content-type is not relevant (see blacklist_content_types),
        return None.

        The dict contains these keys+values:

        * 'parser': a hint on the parser to use for analyzing the content;
          one of 'html', 'plain', 'feed-rss', 'feed-atom', 'feed-json',
          'json', 'xml', 'application'
        * 'content': bytes for type application, otherwise str
        * 'redirects': a list of URLs visited during HTTP redirection,
          the last item is the final URL
        * 'headers': response headers
        """
        if redirect_history is None:
            redirect_history = []
        if len(redirect_history) >= MAX_REDIRECTS:
            return None
        headers_ = default_headers.copy()
        if headers:
            headers_.update(headers)
        try:
            async with self.session.get(
                durl.url(),
                headers=headers_,
                timeout=self.timeout,
            ) as resp:
                redirects = [durl.url()]
                if resp.history:
                    href = resp.history[-1].headers.get('location')
                    if not href or not (redurl := await Durl(href, base=durl)):
                        msg = 'Invalid URL after HTTP redirect'
                        return ResourceError(msg)
                    if in_blacklist(redurl.hostname):
                        src_url = (
                            redirect_history[0]
                            if redirect_history
                            else durl.url()
                        )
                        msg = (
                            f'Dropping URL {src_url}, since'
                            f' redirected to a blacklisted site'
                        )
                        logger.debug(msg)
                        return None
                    redirects = [str(r.url) for r in resp.history]
                    redirects.append(redurl.url())
                    if join := set(redirect_history) & set(redirects):
                        msg = f'Cyclic redirect {join}'
                        return ResourceError(msg)
                if resp.status != 200:
                    msg = f'HTTP status {resp.status}'
                    return ResourceError(
                        msg, status=resp.status, headers=headers
                    )
                c_type = resp.headers.get('content-type', '').split(';')[0]
                if c_type in blacklist_content_types:
                    return None
                result: dict[str, Any] = {
                    'redirects': redirect_history + redirects,
                    'headers': resp.headers,
                }
                if c_type in text_content_types.keys():
                    try:  # catch decoding issues
                        content = await resp.text()
                    except:
                        body = await resp.read()
                        encoding = resp.charset or 'utf-8'
                        encoding = encoding.replace('CP-1250', 'cp1250')
                        content = body.decode(encoding, errors='replace')
                    result['content'] = content
                    result['parser'] = text_content_types[c_type]
                    return result
                elif c_type.startswith('application/'):
                    result['content'] = await resp.read()
                    result['parser'] = 'application'
                    return result
        except aiohttp.ClientError as error:
            # on certificate error try without tls
            if 'SSLCertVerificationError' in str(error):
                if durl.scheme == 'https':
                    url = durl.url()
                    durl.replace_scheme('http')
                    response = await self.get_resp(
                        durl=durl,
                        headers=headers,
                        redirect_history=redirect_history + [url],
                    )
                    if not isinstance(response, ResourceError):
                        return response
            msg = f'ClientError: {error}'
            return ResourceError(msg)
        except Exception as error:
            msg = f'Unknown error: {error}:\n{format_exc()}'
            logger.error(msg)
            return ResourceError(msg)
        return None


async def parse_xml(
    durl: Durl,
    response: dict,
    rss=False,
    atom=False,
) -> Optional[Union[MetaResource, ResourceError]]:
    """
    Parse XML content.

    In particular, parse sitemapindex, sitemap, RSS feed, atom feed.
    """
    try:
        xml = response['content']
        soup = BeautifulSoup(xml, 'html.parser')
    except:
        return None
    if rss or (rss := soup.find('rss')):
        return parse_xml_feed(response)
    elif atom or (atom := soup.find('atom')):
        return parse_xml_feed(response)
    elif sitemapindex := soup.find('sitemapindex'):
        return parse_sitemapindex(sitemapindex)
    elif urlset := soup.find('urlset'):
        return parse_sitemap(urlset)
    else:
        return None


async def parse_json(
    durl: Durl,
    response: dict,
) -> Optional[Union[Feed, ResourceError]]:
    """
    Parse the content of JSON feeds.
    """
    try:
        data = loads(response['content'])
    except:
        msg = f'Could not parse JSON from {durl.url()}'
        logger.debug(msg)
        return None
    if not isinstance(data, dict):
        return None
    if data.get('version', '').startswith('https://jsonfeed.org/'):
        return parse_json_feed(response, data)
    return None
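

# Minimal usage sketch (illustrative only, not part of the crawler's entry
# points). It assumes this module is imported as part of the atextcrawler
# package so that the relative imports above resolve; the URL is just an
# example.
#
#     import asyncio
#     import aiohttp
#
#     async def demo(url: str) -> None:
#         async with aiohttp.ClientSession() as session:
#             fetcher = ResourceFetcher(session)
#             result = await fetcher.fetch(url)
#             if isinstance(result, ResourceError):
#                 print('fetch failed:', result)
#             elif result is None:
#                 print('irrelevant or blacklisted content type')
#             else:
#                 print('got', type(result).__name__)
#
#     asyncio.run(demo('https://example.com/'))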