# atextcrawler/src/atextcrawler/resource/__main__.py
# (97 lines, 3.1 KiB, Python)

"""
Dev tool for fetching and displaying a resource.
Has no permanent effects.
"""
import asyncio
import logging
import sys
from collections import defaultdict
from pprint import pformat
import aiohttp
from ..models import Feed, TextResource
from ..resource import ResourceFetcher
from ..utils.annotation import pack_annotations, unpack_annotations
from ..utils.durl import Durl
# Root logger: attach a stream handler (stderr by default) and let
# everything from DEBUG upwards through, so that all output of this
# dev tool ends up on the console.
logger = logging.getLogger()
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.DEBUG)

# Explicitly enable DEBUG output on the dedicated page-debug logger too.
logger_page_debug = logging.getLogger('atextcrawler.resource.page.debug')
logger_page_debug.setLevel(logging.DEBUG)
def add_tags(text, annotations):
    """
    Reconstruct html from text and annotations.

    This is very similar to what the client does when displaying
    a cached hit.

    `text` is the plain text that the annotation offsets refer to.

    `annotations` is a dict with two keys: 'tags' maps `(start, end)`
    offset pairs to a sequence of tag names, and 'links' maps an href
    to a `(start, end, rel)` triple. A missing key raises `KeyError`.

    Return `text` with opening and closing tags inserted at the
    annotated offsets. Every closing tag is followed by a newline.
    An `<a>` tag gets the href of the link starting at the same offset,
    or '#' if there is none. Text following the last annotated offset
    is kept.
    """
    opening_tags = defaultdict(list)
    closing_tags = defaultdict(list)
    # Outer spans first: ascending start offset and, for equal starts,
    # the span with the larger end offset comes first.
    spans = sorted(
        annotations['tags'].items(), key=lambda x: (x[0][0], -x[0][1])
    )
    for (start, end), tags in spans:
        opening_tags[start].extend(reversed(tags))
        closing_tags[end].extend(reversed(tags))

    # Start offset -> href, used to fill in the href of <a> tags.
    links = {
        start: href
        for href, (start, _end, _rel) in annotations['links'].items()
    }

    parts = []
    last_pos = 0
    for pos in sorted(opening_tags.keys() | closing_tags.keys()):
        parts.append(text[last_pos:pos])
        closing = closing_tags.get(pos, [])
        opening = opening_tags.get(pos, [])
        # A tag name that both closes and opens at this offset is
        # emitted neither way, i.e. the element is simply continued.
        common = set(closing) & set(opening)
        parts.extend(
            f'</{tag}>\n' for tag in reversed(closing) if tag not in common
        )
        for tag in opening:
            if tag in common:
                continue
            if tag == 'a':
                href = links.get(pos, '#')
                parts.append(f'<a href="{href}">')
            else:
                parts.append(f'<{tag}>')
        last_pos = pos
    # Keep the text after the last annotated offset; without this the
    # tail (or the whole text, if there are no tags) would be lost.
    parts.append(text[last_pos:])
    return ''.join(parts)
async def run():
    """
    Fetch and display a resource with URL given as cmdline argument.

    The URL is read from `sys.argv[1]`. If no argument was given, a
    usage hint is written to stderr and nothing is fetched.

    What is logged (at WARNING level) depends on the type of the
    fetched resource: details of a `TextResource`, the debug output of
    a `Feed`, or otherwise the type and the object itself.
    """
    if len(sys.argv) < 2:
        # Avoid a bare IndexError traceback when called without URL.
        print('Usage: python -m atextcrawler.resource URL', file=sys.stderr)
        return
    url = sys.argv[1]
    async with aiohttp.ClientSession() as session:
        # The awaited Durl is used as a check only; on a falsy result
        # nothing is fetched.
        # NOTE(review): presumably falsy means the URL is invalid --
        # confirm against Durl.
        if not await Durl(url):
            return
        fetcher = ResourceFetcher(session)
        resource = await fetcher.fetch(url)
        if isinstance(resource, TextResource):
            logger.warning(repr(resource))
            logger.warning(f'Language: {resource.lang}')
            logger.warning(pformat(resource.search_fields))
            logger.warning(pformat(resource.init_fields))
            # Uncomment to dump the reconstructed html to a file:
            # annotations = resource.search_fields.get('annotations')
            # text = resource.search_fields['text']
            # with open('/tmp/1.html', 'w') as f:
            #     html = add_tags(text, annotations)
            #     f.write(f'<html lang="de">\n<head><title>hhh</title></head>'
            #             f'<body>\n{html}\n</body></html>')
        elif isinstance(resource, Feed):
            logger.warning(resource.debug())
        else:
            logger.warning(f'Resource has type {type(resource)}')
            logger.warning(resource)
# Only start fetching when executed as a script (e.g. with
# ``python -m``), not when the module is merely imported.
if __name__ == '__main__':
    entry_coro = run()
    asyncio.run(entry_coro)