# atextcrawler/src/atextcrawler/resource/__main__.py

"""
Dev tool for fetching and displaying a resource.

Has no permanent effects.
"""

import asyncio
import logging
import sys
from collections import defaultdict
from pprint import pformat

import aiohttp

from ..models import Feed, TextResource
from ..resource import ResourceFetcher
from ..utils.annotation import pack_annotations, unpack_annotations
from ..utils.durl import Durl

# Turn on DEBUG output for the dedicated page debugging logger.
logger_page_debug = logging.getLogger('atextcrawler.resource.page.debug')
logger_page_debug.setLevel(logging.DEBUG)

# This dev tool reports through the root logger: let every level pass
# and attach a stream handler (stderr by default) so output is visible.
logger = logging.getLogger()
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.DEBUG)


def add_tags(text, annotations):
    """
    Reconstruct html from text and annotations.

    This is very similar to what the client does when displaying
    a cached hit.

    `text` is the plain text the annotation positions refer to.
    `annotations` is a dict with two keys: 'tags' maps position pairs
    (start, end) to a sequence of tag names, and 'links' maps an href
    to a triple (start, end, rel).

    Returns the text with opening and closing tags inserted at the
    annotated positions. Every closing tag is followed by a newline.
    Text after the last annotated position is kept, so a text without
    any tags is returned unchanged. Neither the text nor the hrefs
    are html-escaped.
    """
    opening_tags = defaultdict(list)
    closing_tags = defaultdict(list)
    # Process spans ordered by start position; among spans starting at
    # the same position the longer (outer) one comes first.
    anns_tags = sorted(
        annotations['tags'].items(), key=lambda x: (x[0][0], -x[0][1])
    )
    for (i, f), anns in anns_tags:
        opening_tags[i] += reversed(anns)
        closing_tags[f] += reversed(anns)
    positions = sorted(set(opening_tags) | set(closing_tags))
    # Map the start position of each link to its href.
    links = {
        i: href for href, (i, f, rel) in annotations['links'].items()
    }
    parts = []
    last_pos = 0
    for pos in positions:
        parts.append(text[last_pos:pos])
        closing = closing_tags.get(pos, [])
        opening = opening_tags.get(pos, [])
        # A tag that is closed and reopened at the same position is
        # not emitted at all, which merges the two adjacent spans.
        common = set(closing) & set(opening)
        for tag in reversed(closing):
            if tag not in common:
                parts.append(f'</{tag}>\n')
        for tag in opening:
            if tag in common:
                continue
            if tag == 'a':
                href = links.get(pos, '#')
                parts.append(f'<a href="{href}">')
            else:
                parts.append(f'<{tag}>')
        last_pos = pos
    # Keep the text following the last tag position.
    parts.append(text[last_pos:])
    return ''.join(parts)


async def run():
    """
    Fetch and display a resource with URL given as cmdline argument.

    Exits with a usage message if no URL argument was given.
    Returns without output if `Durl(url)` yields a falsy value.
    Otherwise the fetched resource is logged; what is logged depends
    on whether it is a TextResource, a Feed or something else.
    """
    if len(sys.argv) < 2:
        # Fail with a clear message instead of a bare IndexError.
        sys.exit('Usage: python -m atextcrawler.resource URL')
    url = sys.argv[1]
    async with aiohttp.ClientSession() as session:
        # The Durl instance is only used as a check here (presumably it
        # is falsy for invalid URLs); the fetcher gets the original URL.
        if not await Durl(url):
            return
        fetcher = ResourceFetcher(session)
        resource = await fetcher.fetch(url)
        if isinstance(resource, TextResource):
            logger.warning(repr(resource))
            logger.warning(f'Language: {resource.lang}')
            logger.warning(pformat(resource.search_fields))
            logger.warning(pformat(resource.init_fields))

            # Uncomment to write the html reconstructed by add_tags
            # to a file for inspection:
            # annotations = resource.search_fields.get('annotations')
            # text = resource.search_fields['text']
            # with open('/tmp/1.html', 'w') as f:
            #    html = add_tags(text, annotations)
            #    f.write(f'<html lang="de">\n<head><title>hhh</title></head>'
            #            f'<body>\n{html}\n</body></html>')
        elif isinstance(resource, Feed):
            logger.warning(resource.debug())
        else:
            logger.warning(f'Resource has type {type(resource)}')
            logger.warning(resource)


# When executed as a script, run the coroutine `run` to completion
# in a fresh asyncio event loop.
if __name__ == '__main__':
    asyncio.run(run())