atextcrawler/src/atextcrawler/resource/fetch.py

"""
Access to a resource specified by a URL.
"""
import gzip
import logging
from json import loads
from traceback import format_exc
from typing import Any, Optional, Union

import aiohttp
from bs4 import BeautifulSoup

from ..models import (
    Feed,
    MetaResource,
    ResourceError,
    ResourceRedirect,
    Site,
    TextResource,
)
from ..utils.durl import Durl
from ..utils.link import in_blacklist
from .document import parse_document
from .feed import parse_json_feed, parse_xml_feed
from .page import parse_html
from .plaintext import parse_plaintext
from .sitemap import parse_sitemap, parse_sitemapindex

logger = logging.getLogger(__name__)

MAX_REDIRECTS = 10
"""
Maximum number of redirects to follow.
"""

default_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux aarch64; rv:78.0)'
    ' Gecko/20100101 Firefox/78.0',
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1',
    'Accept-Language': 'en-US,en;q=0.5, *;q=0.5',
}
"""
Default HTTP client headers, overwriting those of aiohttp.ClientSession.
"""

blacklist_content_types = [
    '',
    'application/ogg',
]
"""
Blacklist for content-types.
"""

text_content_types = {
    'text/html': 'html',
    'text/plain': 'plain',
    'application/rss+xml': 'feed-rss',
    'application/atom+xml': 'feed-atom',
    'application/feed+json': 'feed-json',
    'application/json': 'json',
    'application/xml': 'xml',
    'text/xml': 'xml',
}
"""
Map content-types to parsers.
"""


class ResourceFetcher:
    """
    Fetch a resource specified by a URL (:meth:`fetch`).

    The timeout is the same for all requests.
    """

    def __init__(
        self,
        session: aiohttp.ClientSession,
        timeout_sock_connect: Union[int, float] = 8,
        timeout_sock_read: Union[int, float] = 30,
    ):
        self.session = session
        self.timeout = aiohttp.ClientTimeout(
            sock_connect=timeout_sock_connect, sock_read=timeout_sock_read
        )

    async def fetch(
        self,
        url: str,
        site: Optional[Site] = None,
        redirect_history: Optional[list[str]] = None,
        headers: Optional[dict] = None,
    ) -> Union[
        None, MetaResource, TextResource, ResourceError, ResourceRedirect
    ]:
        """
        Try to fetch a resource and return an instance or error or redirect.

        If an error was encountered, return a ResourceError.
        If the resource has an irrelevant content type, return None.
        Otherwise return a specific content instance.

        Argument *redirect_history* contains the redirect history;
        if one of the redirects is encountered again, a cyclic-redirect
        ResourceError is returned.
        """
        if redirect_history is None:
            redirect_history = []
        if not (durl := await Durl(url)):
            return ResourceError('Invalid URL')
        resp = await self.get_resp(
            durl,
            redirect_history=redirect_history,
            headers=headers,
        )
        if isinstance(resp, ResourceError):
            return resp
        if resp is None:
            return None
        result = await self._parse(durl, site, resp)
        if isinstance(result, (MetaResource, TextResource)):
            result.id_ = None
        return result

    async def _parse(
        self, durl, site, resp, in_recursion=False
    ) -> Union[
        None, MetaResource, TextResource, ResourceError, ResourceRedirect
    ]:
        """
        Parse a response. May call itself.
        """
        result: Union[
            None, MetaResource, TextResource, ResourceError, ResourceRedirect
        ] = None
        content = resp['content']
        if isinstance(content, str) and content.startswith('<?xml '):
            result = await parse_xml(durl, resp)
        elif resp['parser'] == 'feed-rss':
            result = await parse_xml(durl, resp, rss=True)
        elif resp['parser'] == 'feed-atom':
            result = await parse_xml(durl, resp, atom=True)
        elif resp['parser'] == 'xml':
            result = await parse_xml(durl, resp)
        elif resp['parser'] == 'html':
            result = await parse_html(durl, resp, site)
        elif resp['parser'] in ('json', 'feed-json'):
            result = await parse_json(durl, resp)
        elif resp['parser'] == 'plain':
            result = await parse_plaintext(durl, resp, site)
        elif resp['parser'] == 'application':
            if resp['headers'].get('content-type') == 'application/x-gzip':
                if in_recursion:
                    return None  # consider nested gzip an attack
                resp['content'] = gzip.decompress(resp['content'])
                return await self._parse(durl, site, resp, in_recursion=True)
            result = await parse_document(durl, resp, site)
        if isinstance(result, ResourceRedirect):
            redir_url = result.urls[-1]
            result = await self.fetch(
                redir_url,
                site=site,
                redirect_history=result.urls[:-1],
            )
        return result

    async def get_resp(
        self,
        durl: Durl,
        headers: Optional[dict] = None,
        redirect_history: Optional[list[str]] = None,
    ) -> Optional[Union[ResourceError, dict]]:
        """
        Try to fetch a url returning a ResourceError or a dict with content.

        Optional *headers* will overwrite the :var:`default_headers`.

        If the response status is not 200, always return a ResourceError.
        If the content-type is not relevant (see blacklist_content_types),
        return None.

        The dict contains these keys+values:

        * 'parser': a hint on the parser to use for analyzing the content;
          one of 'html', 'plain', 'feed-rss', 'feed-atom', 'feed-json',
          'json', 'xml', 'application'
        * 'content': bytes for type application, otherwise str
        * 'redirects': a list of URLs visited during HTTP redirection,
          the last item is the final URL
        * 'headers': response headers
        """
        if redirect_history is None:
            redirect_history = []
        if len(redirect_history) >= MAX_REDIRECTS:
            return None
        headers_ = default_headers.copy()
        if headers:
            headers_.update(headers)
        try:
            async with self.session.get(
                durl.url(),
                headers=headers_,
                timeout=self.timeout,
            ) as resp:
                redirects = [durl.url()]
                if resp.history:
                    href = resp.history[-1].headers.get('location')
                    if not href or not (redurl := await Durl(href, base=durl)):
                        msg = 'Invalid URL after HTTP redirect'
                        return ResourceError(msg)
                    if in_blacklist(redurl.hostname):
                        src_url = (
                            redirect_history[0]
                            if redirect_history
                            else durl.url()
                        )
                        msg = (
                            f'Dropping URL {src_url}, since'
                            f' redirected to a blacklisted site'
                        )
                        logger.debug(msg)
                        return None
                    redirects = [str(r.url) for r in resp.history]
                    redirects.append(redurl.url())
                    if join := set(redirect_history) & set(redirects):
                        msg = f'Cyclic redirect {join}'
                        return ResourceError(msg)
                if resp.status != 200:
                    msg = f'HTTP status {resp.status}'
                    return ResourceError(
                        msg, status=resp.status, headers=headers
                    )
                c_type = resp.headers.get('content-type', '').split(';')[0]
                if c_type in blacklist_content_types:
                    return None
                result: dict[str, Any] = {
                    'redirects': redirect_history + redirects,
                    'headers': resp.headers,
                }
                if c_type in text_content_types.keys():
                    try:  # catch decoding issues
                        content = await resp.text()
                    except:
                        body = await resp.read()
                        encoding = resp.charset or 'utf-8'
                        encoding = encoding.replace('CP-1250', 'cp1250')
                        content = body.decode(encoding, errors='replace')
                    result['content'] = content
                    result['parser'] = text_content_types[c_type]
                    return result
                elif c_type.startswith('application/'):
                    result['content'] = await resp.read()
                    result['parser'] = 'application'
                    return result
        except aiohttp.ClientError as error:
            # on certificate error try without tls
            if 'SSLCertVerificationError' in str(error):
                if durl.scheme == 'https':
                    url = durl.url()
                    durl.replace_scheme('http')
                    response = await self.get_resp(
                        durl=durl,
                        headers=headers,
                        redirect_history=redirect_history + [url],
                    )
                    if not isinstance(response, ResourceError):
                        return response
            msg = f'ClientError: {error}'
            return ResourceError(msg)
        except Exception as error:
            msg = f'Unknown error: {error}:\n{format_exc()}'
            logger.error(msg)
            return ResourceError(msg)
        return None


async def parse_xml(
    durl: Durl,
    response: dict,
    rss=False,
    atom=False,
) -> Optional[Union[MetaResource, ResourceError]]:
    """
    Parse XML content.

    In particular, parse sitemapindex, sitemap, RSS feed, atom feed.
    """
    try:
        xml = response['content']
        soup = BeautifulSoup(xml, 'html.parser')
    except:
        return None
    if rss or (rss := soup.find('rss')):
        return parse_xml_feed(response)
    elif atom or (atom := soup.find('atom')):
        return parse_xml_feed(response)
    elif sitemapindex := soup.find('sitemapindex'):
        return parse_sitemapindex(sitemapindex)
    elif urlset := soup.find('urlset'):
        return parse_sitemap(urlset)
    else:
        return None


async def parse_json(
    durl: Durl,
    response: dict,
) -> Optional[Union[Feed, ResourceError]]:
    """
    Parse the content of JSON feeds.
    """
    try:
        data = loads(response['content'])
    except:
        msg = f'Could not parse JSON from {durl.url()}'
        logger.debug(msg)
        return None
    if not isinstance(data, dict):
        return None
    if data.get('version', '').startswith('https://jsonfeed.org/'):
        return parse_json_feed(response, data)
    return None
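

# Minimal usage sketch (illustrative only, not part of the crawler's entry
# points). It assumes this module is imported as part of the atextcrawler
# package so that the relative imports above resolve; the URL is just an
# example.
#
#     import asyncio
#     import aiohttp
#
#     async def demo(url: str) -> None:
#         async with aiohttp.ClientSession() as session:
#             fetcher = ResourceFetcher(session)
#             result = await fetcher.fetch(url)
#             if isinstance(result, ResourceError):
#                 print('fetch failed:', result)
#             elif result is None:
#                 print('irrelevant or blacklisted content type')
#             else:
#                 print('got', type(result).__name__)
#
#     asyncio.run(demo('https://example.com/'))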