atextcrawler/src/atextcrawler/site/robots.py

"""
Fetch and evaluate a website's robots.txt.
"""
import asyncio
import logging
from typing import Optional, Union
from urllib.robotparser import RobotFileParser

import aiohttp

logger = logging.getLogger(__name__)


class RobotsInfo(RobotFileParser):
    """
    Obtain information from a site's robots.txt.

    After instantiation you must await the instance itself; awaiting it
    fetches and parses the robots.txt (see the usage sketch at the bottom
    of this module).
    """

    def __init__(
        self,
        site_url: str,
        user_agent: str = '*',
        session: Optional[aiohttp.ClientSession] = None,
    ):
        super().__init__()
        self.__user_agent = user_agent
        self.__site_url = site_url.rstrip('/')
        self.__robots_url = self.__site_url + '/robots.txt'
        self.__timeout = aiohttp.ClientTimeout(sock_connect=2, sock_read=3)
        self.__session = session

    def __await__(self):
        # Make instances awaitable: ``await RobotsInfo(...)`` runs
        # :meth:`__ainit__` and returns the initialized instance.
        return self.__ainit__().__await__()

    async def __ainit__(self):
        if self.__session:
            content = await self.__get_robots_txt(self.__session)
        else:
            async with aiohttp.ClientSession() as session:
                content = await self.__get_robots_txt(session)
        self.parse(content.splitlines())
        # Prefer an explicit crawl-delay; otherwise derive a delay in
        # seconds per request from the declared request rate.
        self.__delay = self.crawl_delay(self.__user_agent)
        request_rate = self.request_rate(self.__user_agent)
        if request_rate:
            self.__delay = request_rate.seconds / request_rate.requests
        self.__site_maps = super().site_maps() or []
        return self

    async def __get_robots_txt(self, session: aiohttp.ClientSession) -> str:
        """
        Fetch and return the robots.txt over http.

        Return an empty string if it cannot be fetched.
        """
        try:
            async with session.get(
                self.__robots_url, timeout=self.__timeout
            ) as resp:
                if resp.status == 200:
                    try:
                        content = await resp.text()
                    except (UnicodeDecodeError, LookupError):
                        # Fall back to a lenient manual decode.
                        body = await resp.read()
                        content = body.decode(
                            resp.charset or 'utf-8', errors='ignore'
                        )
                else:
                    content = ''
        except (aiohttp.ClientError, asyncio.TimeoutError):
            # A site without a reachable robots.txt imposes no restrictions.
            content = ''
        return content

    @property
    def user_agent(self) -> str:
        """
        The user agent being used.
        """
        return self.__user_agent

    @property
    def delay(self) -> Optional[Union[int, float]]:
        """
        The delay to be used between requests.
        """
        return self.__delay

    @property
    def site_maps(self) -> list[str]:
        """
        The list of sitemaps of the site.
        """
        return self.__site_maps

    def can_fetch_url(self, url: str) -> bool:
        """
        Return whether fetching of the given *url* is allowed.
        """
        return super().can_fetch(self.__user_agent, url)
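

# Minimal usage sketch: awaiting the instance fetches and parses the site's
# robots.txt before the properties are read. The URL and user agent below
# are placeholders, not values used by the crawler itself.
if __name__ == '__main__':

    async def _demo() -> None:
        robots = await RobotsInfo('https://example.com', user_agent='mybot')
        print('crawl delay:', robots.delay)
        print('sitemaps:', robots.site_maps)
        print('may fetch /:', robots.can_fetch_url('https://example.com/'))

    asyncio.run(_demo())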