"""
Dev tool for fetching and displaying a resource.

Has no permanent effects.
"""
import asyncio
import logging
import sys
from collections import defaultdict
from pprint import pformat

import aiohttp

from ..models import Feed, TextResource
from ..resource import ResourceFetcher
from ..utils.annotation import pack_annotations, unpack_annotations
from ..utils.durl import Durl

logger = logging.getLogger()
|
|
logger.setLevel(logging.DEBUG)
|
|
logger.addHandler(logging.StreamHandler())
|
|
logger_page_debug = logging.getLogger('atextcrawler.resource.page.debug')
|
|
logger_page_debug.setLevel(logging.DEBUG)
def add_tags(text, annotations):
    """
    Reconstruct html from text and annotations.

    This is very similar to what the client does when displaying
    a cached hit.

    :param text: plain text extracted from a resource
    :param annotations: dict with key 'tags' mapping (start, end) spans
        to lists of tag names, and key 'links' mapping hrefs to
        (start, end, rel) tuples
    :return: html string with the tags re-inserted into the text
    """
    html = ''
    opening_tags = defaultdict(list)
    closing_tags = defaultdict(list)
    # Sort spans by start position; at equal starts, longer (outer) spans
    # come first so that nesting is reconstructed correctly.
    anns_tags = sorted(
        annotations['tags'].items(), key=lambda x: (x[0][0], -x[0][1])
    )
    for (i, f), anns in anns_tags:
        opening_tags[i] += [tag for tag in reversed(anns)]
        closing_tags[f] += [tag for tag in reversed(anns)]
    positions = sorted(set(opening_tags.keys()) | set(closing_tags.keys()))
    last_pos = 0
    # Map text position -> href, so opening <a> tags get their target.
    links = {i: href for href, (i, f, rel) in annotations['links'].items()}
    for pos in positions:
        html += text[last_pos:pos]
        closing = closing_tags.get(pos, [])
        opening = opening_tags.get(pos, [])
        # A tag closed and re-opened at the same position cancels out.
        common = set(closing) & set(opening)
        closing = [tag for tag in closing if tag not in common]
        opening = [tag for tag in opening if tag not in common]
        for tag in reversed(closing):
            html += f'</{tag}>\n'
        for tag in opening:
            if tag == 'a':
                href = links.get(pos, '#')
                html += f'<a href="{href}">'
            else:
                html += f'<{tag}>'
        last_pos = pos
    # Bugfix: append trailing text after the last annotated position;
    # it was previously dropped from the reconstruction.
    html += text[last_pos:]
    return html
async def run():
    """
    Fetch and display a resource with URL given as cmdline argument.
    """
    url = sys.argv[1]
    async with aiohttp.ClientSession() as session:
        # Validate the URL first; bail out silently if it does not parse.
        durl = await Durl(url)
        if not durl:
            return
        resource = await ResourceFetcher(session).fetch(url)
        if isinstance(resource, TextResource):
            # Dump the parsed text resource at warning level so it is
            # visible regardless of handler configuration.
            for line in (
                repr(resource),
                f'Language: {resource.lang}',
                pformat(resource.search_fields),
                pformat(resource.init_fields),
            ):
                logger.warning(line)

            # annotations = resource.search_fields.get('annotations')
            # text = resource.search_fields['text']
            # with open('/tmp/1.html', 'w') as f:
            #     html = add_tags(text, annotations)
            #     f.write(f'<html lang="de">\n<head><title>hhh</title></head>'
            #             f'<body>\n{html}\n</body></html>')
        elif isinstance(resource, Feed):
            logger.warning(resource.debug())
        else:
            logger.warning(f'Resource has type {type(resource)}')
            logger.warning(resource)
if __name__ == '__main__':
|
|
asyncio.run(run())