69 lines
2.2 KiB
Python
69 lines
2.2 KiB
Python
"""
Tool for analyzing a website.

Fetch the startpage and print information about it to the console.
Do not change any persistent data.
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import sys
|
|
|
|
import aiohttp
|
|
|
|
from ..models import TextResource
|
|
from ..resource import ResourceFetcher, extract_sitemap_paths, get_sitemap_urls
|
|
from ..site.robots import RobotsInfo
|
|
from ..utils.durl import Durl
|
|
from .parse import parse_startpage
|
|
|
|
# This tool reports everything through the root logger, so the handler
# attached here also receives records propagated from other loggers.
# StreamHandler() without arguments writes to sys.stderr.
logger = logging.getLogger()
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.WARNING)
|
|
|
|
|
|
async def run():
    """
    Fetch the startpage of a website and show information about it.

    The URL must be given as the first commandline argument. If it is
    missing, a hint is logged and nothing is fetched.

    All output goes through the module-level ``logger`` at WARNING level:
    the fetched resource and, if it is an HTML text resource, the parsed
    site, its external links, the redirects recorded on the resource,
    the paths extracted from the sitemaps, the site's feeds and the
    ``latest`` value returned by ``extract_sitemap_paths``.

    Returns None in every case.
    """
    if len(sys.argv) < 2:
        # Without this guard a missing argument surfaced as a bare
        # IndexError traceback instead of a hint on how to call the tool.
        logger.warning('Missing argument: URL of the website to analyze.')
        return
    base_url = sys.argv[1]
    async with aiohttp.ClientSession() as session:
        # A falsy Durl ends the run without any output.
        # NOTE(review): presumably this means the URL is invalid or cannot
        # be resolved - confirm in utils.durl.
        if not await Durl(base_url):
            return
        fetcher = ResourceFetcher(session)
        resource = await fetcher.fetch(base_url)
        logger.warning(repr(resource))
        is_html = (
            isinstance(resource, TextResource)
            and resource.content_type == 'html'
        )
        if not is_html:
            logger.warning('(No text resource or error.)')
            return
        site = await parse_startpage(resource)
        # NOTE: disabled: site.crawl_enabled = await site_filter(site)
        logger.warning(repr(site))
        logger.warning('')
        # Each external link is shown twice: once via the str() of its
        # Durl and once via its url() method.
        for durl, text in site.links_ext.items():
            logger.warning(f' {durl} {text}')
            logger.warning(f'{durl.url()} -------- {text}')
        logger.warning('')
        logger.warning(f'Redirects: {resource.init_fields["redirects"]}')
        logger.warning('')
        # Sitemap locations announced in robots.txt are passed on to the
        # sitemap lookup.
        robots = await RobotsInfo(base_url)
        urls = await get_sitemap_urls(
            fetcher, base_url, sitemaps=robots.site_maps
        )
        paths, latest = extract_sitemap_paths(base_url, urls)
        for path in paths:
            logger.warning(path)
        logger.warning(f'Feeds: {site.feeds}')
        logger.warning(latest)
        # NOTE: disabled:
        # sample_links = extract_samples(resource.init_fields['links_int'])
        # logger.warning(f'************* {sample_links}')
|
|
|
|
|
|
# Script entry point: asyncio.run() creates a fresh event loop, drives
# run() to completion and closes the loop again.
if __name__ == '__main__':
    asyncio.run(run())
|