atextcrawler/src/atextcrawler/site/__main__.py

"""
Tool for analyzing a website.
Fetch the startpage and output information to console.
Do not change any persistent data.
"""
import asyncio
import logging
import sys

import aiohttp

from ..models import TextResource
from ..resource import (
    ResourceFetcher,
    extract_sitemap_paths,
    get_sitemap_urls,
)
from ..site.robots import RobotsInfo
from ..utils.durl import Durl
from .parse import parse_startpage

logger = logging.getLogger()
logger.setLevel(logging.WARNING)
logger.addHandler(logging.StreamHandler())
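# Note: output is emitted via logger.warning() so that it reaches the console
# with this minimal logging setup (level WARNING, plain stream handler).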


async def run():
    """
    Fetch the startpage of a website and show information about it.

    The URL must be given as a command-line argument.
    """
    base_url = sys.argv[1]
    async with aiohttp.ClientSession() as session:
        # Durl validates and normalizes the URL; give up if it is invalid.
        if not (base_durl := await Durl(base_url)):
            return
        fetcher = ResourceFetcher(session)
        resource = await fetcher.fetch(base_url)
        logger.warning(repr(resource))
        if (
            isinstance(resource, TextResource)
            and resource.content_type == 'html'
        ):
            site = await parse_startpage(resource)
            # site.crawl_enabled = await site_filter(site)
            logger.warning(repr(site))
            logger.warning('')
            # List the external links found on the startpage.
            for durl, text in site.links_ext.items():
                logger.warning(f' {durl} {text}')
                logger.warning(f'{durl.url()} -------- {text}')
            logger.warning('')
            logger.warning(f'Redirects: {resource.init_fields["redirects"]}')
            logger.warning('')
            # Obtain sitemap URLs from robots.txt and show their paths.
            robots = await RobotsInfo(base_url)
            urls = await get_sitemap_urls(
                fetcher, base_url, sitemaps=robots.site_maps
            )
            paths, latest = extract_sitemap_paths(base_url, urls)
            for path in paths:
                logger.warning(path)
            logger.warning(f'Feeds: {site.feeds}')
            logger.warning(latest)
            # sample_links = extract_samples(resource.init_fields['links_int'])
            # logger.warning(f'************* {sample_links}')
        else:
            logger.warning('(No text resource or error.)')


if __name__ == '__main__':
    asyncio.run(run())