"""
|
|
Fetch and evaluate a website's robots.txt.
|
|
"""
|
|
|
|
import asyncio
import logging
from typing import Optional, Union
from urllib.robotparser import RobotFileParser

import aiohttp
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class RobotsInfo(RobotFileParser):
    """
    Obtain information from a site's robots.txt.

    Instances are awaitable: ``info = await RobotsInfo(url)`` fetches
    and parses the site's robots.txt and returns the ready-to-use
    object.  There is no separate startup method — awaiting the
    instance runs :meth:`__ainit__`.
    """

    def __init__(
        self,
        site_url: str,
        user_agent: str = '*',
        session: Optional[aiohttp.ClientSession] = None,
    ):
        """
        :param site_url: base URL of the site (scheme + host).
        :param user_agent: user agent used for all rule lookups.
        :param session: optional shared client session; when omitted, a
            temporary session is created for the single fetch.
        """
        super().__init__()
        self.__user_agent = user_agent
        # Strip a trailing slash so joining cannot yield '//robots.txt'.
        self.__site_url = site_url.rstrip('/')
        self.__robots_url = self.__site_url + '/robots.txt'
        self.__timeout = aiohttp.ClientTimeout(sock_connect=2, sock_read=3)
        self.__session = session

    def __await__(self):
        # Makes the instance itself awaitable; delegates to __ainit__.
        return self.__ainit__().__await__()

    async def __ainit__(self):
        """Fetch and parse the robots.txt; return the initialized self."""
        if self.__session:
            content = await self.__get_robots_txt(self.__session)
        else:
            # No caller-supplied session: use a short-lived one.
            async with aiohttp.ClientSession() as session:
                content = await self.__get_robots_txt(session)
        self.parse(content.splitlines())
        # Crawl-delay is the baseline; a request-rate directive, when
        # present, overrides it with seconds-per-request.
        self.__delay = self.crawl_delay(self.__user_agent)
        request_rate = self.request_rate(self.__user_agent)
        if request_rate:
            self.__delay = request_rate.seconds / request_rate.requests
        self.__site_maps = super().site_maps() or []
        return self

    async def __get_robots_txt(self, session: aiohttp.ClientSession) -> str:
        """
        Fetch and return the robots.txt over http.

        Returns an empty string on network errors, timeouts, or any
        non-200 response; an empty rule set allows everything.
        """
        try:
            async with session.get(
                self.__robots_url, timeout=self.__timeout
            ) as resp:
                if resp.status == 200:
                    try:
                        content = await resp.text()
                    except UnicodeDecodeError:
                        # Mis-declared or broken charset: fall back to a
                        # lenient manual decode instead of failing.
                        body = await resp.read()
                        content = body.decode(
                            resp.charset or 'utf-8', errors='ignore'
                        )
                else:
                    content = ''
        except (aiohttp.ClientError, asyncio.TimeoutError):
            # asyncio.TimeoutError is not a ClientError, so timeouts must
            # be caught explicitly; treat an unreachable robots.txt as
            # absent rather than crashing the awaiting caller.
            logger.warning('could not fetch %s', self.__robots_url)
            content = ''
        return content

    @property
    def user_agent(self) -> str:
        """
        The user agent being used.
        """
        return self.__user_agent

    @property
    def delay(self) -> Optional[Union[int, float]]:
        """
        The delay to be used between requests.
        """
        return self.__delay

    @property
    def site_maps(self) -> list[str]:
        """
        The list of sitemaps of the site.
        """
        return self.__site_maps

    def can_fetch_url(self, url: str) -> bool:
        """
        Return whether fetching of the given *url* is allowed.
        """
        return super().can_fetch(self.__user_agent, url)