atextcrawler/src/atextcrawler/resource/plaintext.py

"""
Parse plaintext pages.
"""
import logging
import re
from typing import Any, Optional, Union

import pypandoc

from ..models import ResourceError, ResourceRedirect, Site, TextResource
from ..utils.annotation import annotate
from ..utils.date_finder import extract_latest_date
from ..utils.durl import Durl
from ..utils.http import get_header_links
from ..utils.lang import extract_content_language
from ..utils.muse import parse_muse

logger = logging.getLogger(__name__)

MAX_LINK_TEXT_LENGTH = 100
"""
Maximum length of a link's text to be kept.

Cf. table site_link, column link_text.
"""

re_url = re.compile(
    r'((http|https)://[\w_-]+\.[\w_-]+(:[0-9]+)?'
    r'([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)'
)
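# Illustrative behaviour (findall returns one 4-tuple per match, the
# full URL being the first element):
#     re_url.findall('see https://example.com/a and http://foo.org')
#     -> [('https://example.com/a', 'https', '', '/a'),
#         ('http://foo.org', 'http', '', '')]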

re_nl = re.compile(r'\r\n')
re_ws = re.compile(r'\s*\n\s*\n\s*')
re_nn = re.compile(r'\n\n')


async def parse_plaintext(
    durl: Durl,
    resp: dict,
    site: Optional[Site],
) -> Optional[Union[ResourceRedirect, TextResource]]:
    """
    Extract relevant data from a plaintext response.

    Return a TextResource instance, a ResourceRedirect if the HTTP
    headers announce a canonical URL differing from *durl*, or None if
    the response has no content. The given URL must be the full URL
    (incl. scheme and netloc) of the page.
    """
    text = resp['content']

    # HTTP headers, canonical URL, shortlink
    header_links = await get_header_links(resp['headers'], durl, site)
    if canonical := header_links.get('canonical'):
        if canonical != durl.url():
            return ResourceRedirect(resp['redirects'] + [canonical])
    shortlink = header_links.get('shortlink')

    if not text:
        return None

    # normalize CRLF to LF and collapse whitespace-padded blank lines
    # to single blank lines
    text = re_nl.sub('\n', text)
    text = re_ws.sub('\n\n', text)

    # meta info
    meta: dict[str, Any] = {}
    muse = None
    if durl.path.endswith('.muse'):
        muse = parse_muse(text)
        if muse:
            meta, text = muse

    # title: default to the first line of the text
    if not meta.get('title'):
        meta['title'] = text[:200].splitlines()[0]

    # content language
    if not meta.get('lang'):
        meta['lang'] = extract_content_language(text)

    # publication date
    if not meta.get('pub_date'):
        meta['pub_date'] = extract_latest_date(text, lang=meta.get('lang'))

    # links: collect URLs found in the text, split into
    # same-site (internal) and other-site (external) links
    links_int: dict[Durl, tuple[list[str], str]] = {}
    links_ext: dict[Durl, tuple[list[str], str]] = {}
    for url in re_url.findall(text):
        link_durl = await Durl(url[0])
        if link_durl:
            if link_durl.site() == durl.site():
                links_int[link_durl] = [], link_durl.url()
            else:
                links_ext[link_durl] = [], link_durl.url()

    # annotations: muse content goes through pandoc to HTML first
    if muse:
        html = pypandoc.convert_text(text, 'html5', format='muse').strip()
        text, annotations = annotate(html)
    else:
        text, annotations = annotate_text(text)

    return TextResource(
        content_type=resp['parser'],
        last_change=meta.get('pub_date'),
        text_len=len(text),
        lang=meta.get('lang'),
        title=meta.get('title'),
        init_fields={
            'durl': durl,
            'site': site,
            'headers': resp['headers'],
            'redirects': resp['redirects'],
            'links_int': links_int,
            'links_ext': links_ext,
            'shortlink': shortlink,
            'canonical': None,
        },
        search_fields={
            'title': meta.get('title'),
            'authors': meta.get('authors'),
            'pub_date': meta.get('pub_date'),
            'keywords': meta.get('keywords'),
            'summary': meta.get('summary'),
            'text': text,
            'annotations': annotations,
        },
    )
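

# Usage sketch for parse_plaintext (illustrative, not part of the
# original module); assumes a fetcher has already produced a response
# dict with the keys accessed above ('content', 'headers', 'redirects',
# 'parser'):
#
#     durl = await Durl('https://example.org/notes.txt')
#     resp = {
#         'content': raw_text,
#         'headers': {},
#         'redirects': [],
#         'parser': 'plaintext',
#     }
#     resource = await parse_plaintext(durl, resp, site=None)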


def annotate_text(text):
    """
    Return annotations as :func:`utils.annotation.annotate` does.

    Here we only have information on semantic breaks (in plaintext
    they are where empty lines are).
    """
    semantic_breaks = {}
    for match in re_nn.finditer(text):
        # the offset of the first newline of each blank-line pair
        # marks a semantic break
        semantic_breaks[match.span()[0]] = ''
    annotations = {
        'tags': {},
        'semantic_breaks': semantic_breaks,
        'section_ids': {},
        'links': {},
    }
    return text, annotations
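

if __name__ == '__main__':
    # Minimal self-check of annotate_text (illustrative, not part of
    # the original module). In the sample below the two blank-line
    # pairs start at offsets 5 and 19, so those offsets are recorded
    # as semantic breaks.
    sample = 'Intro\n\nNext section\n\nEnd'
    _, anns = annotate_text(sample)
    print(anns['semantic_breaks'])  # {5: '', 19: ''}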