149 lines
3.9 KiB
Python
149 lines
3.9 KiB
Python
"""
|
|
Parse plaintext pages.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any, Optional, Union
|
|
|
|
import pypandoc
|
|
|
|
from ..models import ResourceError, ResourceRedirect, Site, TextResource
|
|
from ..utils.annotation import annotate
|
|
from ..utils.date_finder import extract_latest_date
|
|
from ..utils.durl import Durl
|
|
from ..utils.http import get_header_links
|
|
from ..utils.lang import extract_content_language
|
|
from ..utils.muse import parse_muse
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
MAX_LINK_TEXT_LENGTH = 100
"""
Maximum length of a link's text to be kept.

Cf. table site_link, column link_text.
"""


# Absolute http(s) URL: scheme, hostname (with optional port) and an
# optional path/query part that must not end in punctuation like '.' or ','.
re_url = re.compile(
    r'((http|https)://[\w_-]+\.[\w_-]+(:[0-9]+)?'
    r'([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)'
)


# Windows line ending; normalized to '\n' before further processing.
re_nl = re.compile(r'\r\n')


# A blank line together with surrounding whitespace; collapsed to a
# uniform paragraph separator ('\n\n').
re_ws = re.compile(r'\s*\n\s*\n\s*')


# Paragraph separator after normalization; used to locate semantic breaks.
re_nn = re.compile(r'\n\n')
|
|
|
|
|
|
async def parse_plaintext(
    durl: Durl,
    resp: dict,
    site: Optional[Site],
) -> Optional[Union[ResourceRedirect, TextResource]]:
    """
    Extract relevant data from a response returning a TextResource instance.

    The given URL must be the full URL (incl. scheme and netloc) of the page.

    :param durl: parsed URL of the fetched page
    :param resp: response dict; keys 'content', 'headers', 'redirects' and
        'parser' are read here
    :param site: the site the page belongs to, if known
    :return: a ResourceRedirect if the HTTP headers announce a differing
        canonical URL, None if the response has no content, otherwise a
        TextResource with the extracted data
    """
    text = resp['content']

    # HTTP headers, canonical URL, shortlink
    header_links = await get_header_links(resp['headers'], durl, site)
    if canonical := header_links.get('canonical'):
        if canonical != durl.url():
            # Defer to the canonical URL instead of parsing this page.
            return ResourceRedirect(resp['redirects'] + [canonical])
    shortlink = header_links.get('shortlink')

    if not text:
        return None

    # Normalize line endings and collapse blank-line runs to exactly
    # '\n\n' so paragraph boundaries are uniform for the steps below.
    text = re_nl.sub('\n', text)
    text = re_ws.sub('\n\n', text)

    # meta info
    meta: dict[str, Any] = {}
    muse = None
    if durl.path.endswith('.muse'):
        # Muse markup carries its own metadata block; parse_muse returns
        # (meta, remaining_text) on success, falsy on failure.
        muse = parse_muse(text)
        if muse:
            meta, text = muse
    # title: fall back to the first line of the (truncated) text
    if not meta.get('title'):
        meta['title'] = text[:200].splitlines()[0]
    # content language
    if not meta.get('lang'):
        meta['lang'] = extract_content_language(text)
    # publication date
    if not meta.get('pub_date'):
        meta['pub_date'] = extract_latest_date(text, lang=meta.get('lang'))

    # links: scan the text for absolute http(s) URLs and split them into
    # internal (same site as durl) and external ones
    links_int: dict[Durl, tuple[list[str], str]] = {}
    links_ext: dict[Durl, tuple[list[str], str]] = {}
    for url in re_url.findall(text):
        # re_url has several groups; url[0] is the full URL match
        link_durl = await Durl(url[0])
        if link_durl:
            if link_durl.site() == durl.site():
                # empty link-text list, paired with the normalized URL
                links_int[link_durl] = [], link_durl.url()
            else:
                links_ext[link_durl] = [], link_durl.url()

    if muse:
        # Convert muse markup to HTML and annotate the rendered result.
        html = pypandoc.convert_text(text, 'html5', format='muse').strip()
        text, annotations = annotate(html)
    else:
        # Plain text: only semantic breaks (blank lines) can be annotated.
        text, annotations = annotate_text(text)

    return TextResource(
        content_type=resp['parser'],
        last_change=meta.get('pub_date'),
        text_len=len(text),
        lang=meta.get('lang'),
        title=meta.get('title'),
        init_fields={
            'durl': durl,
            'site': site,
            'headers': resp['headers'],
            'redirects': resp['redirects'],
            'links_int': links_int,
            'links_ext': links_ext,
            'shortlink': shortlink,
            # canonical was handled above; a differing value never gets here
            'canonical': None,
        },
        search_fields={
            'title': meta.get('title'),
            'authors': meta.get('authors'),
            'pub_date': meta.get('pub_date'),
            'keywords': meta.get('keywords'),
            'summary': meta.get('summary'),
            'text': text,
            'annotations': annotations,
        },
    )
|
|
|
|
|
|
def annotate_text(text: str) -> tuple[str, dict[str, Any]]:
    """
    Return annotations as :func:`utils.annotation.annotate` does.

    Here we only have information on semantic breaks
    (in plaintext they are where empty lines are).

    :param text: normalized plaintext (paragraphs separated by blank lines)
    :return: the unchanged text and an annotations dict with the same keys
        as :func:`utils.annotation.annotate` returns; only
        'semantic_breaks' is populated, mapping the character offset of
        each paragraph separator to an empty section label
    """
    # Offsets of non-overlapping '\n\n' paragraph separators.
    semantic_breaks = {
        match.start(): '' for match in re.finditer(r'\n\n', text)
    }
    annotations = {
        'tags': {},
        'semantic_breaks': semantic_breaks,
        'section_ids': {},
        'links': {},
    }
    return text, annotations
|