atextcrawler/src/atextcrawler/resource/plaintext.py

"""
Parse plaintext pages.
"""
import logging
import re
from typing import Any, Optional, Union

import pypandoc

from ..models import ResourceError, ResourceRedirect, Site, TextResource
from ..utils.annotation import annotate
from ..utils.date_finder import extract_latest_date
from ..utils.durl import Durl
from ..utils.http import get_header_links
from ..utils.lang import extract_content_language
from ..utils.muse import parse_muse

logger = logging.getLogger(__name__)

MAX_LINK_TEXT_LENGTH = 100
"""
Maximum length of a link's text to be kept.

Cf. table site_link, column link_text.
"""

re_url = re.compile(
    r'((http|https)://[\w_-]+\.[\w_-]+(:[0-9]+)?'
    r'([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)'
)
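# Illustrative behaviour (findall returns one 4-tuple per match, the
# full URL being the first element):
#     re_url.findall('see https://example.com/a and http://foo.org')
#     -> [('https://example.com/a', 'https', '', '/a'),
#         ('http://foo.org', 'http', '', '')]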

re_nl = re.compile(r'\r\n')
re_ws = re.compile(r'\s*\n\s*\n\s*')
re_nn = re.compile(r'\n\n')


async def parse_plaintext(
    durl: Durl,
    resp: dict,
    site: Optional[Site],
) -> Optional[Union[ResourceRedirect, TextResource]]:
    """
    Extract relevant data from a plaintext response.

    Return a TextResource instance, a ResourceRedirect if the HTTP
    headers announce a canonical URL differing from *durl*, or None if
    the response has no content. The given URL must be the full URL
    (incl. scheme and netloc) of the page.
    """
    text = resp['content']

    # HTTP headers, canonical URL, shortlink
    header_links = await get_header_links(resp['headers'], durl, site)
    if canonical := header_links.get('canonical'):
        if canonical != durl.url():
            return ResourceRedirect(resp['redirects'] + [canonical])
    shortlink = header_links.get('shortlink')

    if not text:
        return None

    # normalize CRLF to LF and collapse whitespace-padded blank lines
    # to single blank lines
    text = re_nl.sub('\n', text)
    text = re_ws.sub('\n\n', text)

    # meta info
    meta: dict[str, Any] = {}
    muse = None
    if durl.path.endswith('.muse'):
        muse = parse_muse(text)
        if muse:
            meta, text = muse

    # title: default to the first line of the text
    if not meta.get('title'):
        meta['title'] = text[:200].splitlines()[0]

    # content language
    if not meta.get('lang'):
        meta['lang'] = extract_content_language(text)

    # publication date
    if not meta.get('pub_date'):
        meta['pub_date'] = extract_latest_date(text, lang=meta.get('lang'))

    # links: collect URLs found in the text, split into
    # same-site (internal) and other-site (external) links
    links_int: dict[Durl, tuple[list[str], str]] = {}
    links_ext: dict[Durl, tuple[list[str], str]] = {}
    for url in re_url.findall(text):
        link_durl = await Durl(url[0])
        if link_durl:
            if link_durl.site() == durl.site():
                links_int[link_durl] = [], link_durl.url()
            else:
                links_ext[link_durl] = [], link_durl.url()

    # annotations: muse content goes through pandoc to HTML first
    if muse:
        html = pypandoc.convert_text(text, 'html5', format='muse').strip()
        text, annotations = annotate(html)
    else:
        text, annotations = annotate_text(text)

    return TextResource(
        content_type=resp['parser'],
        last_change=meta.get('pub_date'),
        text_len=len(text),
        lang=meta.get('lang'),
        title=meta.get('title'),
        init_fields={
            'durl': durl,
            'site': site,
            'headers': resp['headers'],
            'redirects': resp['redirects'],
            'links_int': links_int,
            'links_ext': links_ext,
            'shortlink': shortlink,
            'canonical': None,
        },
        search_fields={
            'title': meta.get('title'),
            'authors': meta.get('authors'),
            'pub_date': meta.get('pub_date'),
            'keywords': meta.get('keywords'),
            'summary': meta.get('summary'),
            'text': text,
            'annotations': annotations,
        },
    )
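

# Usage sketch for parse_plaintext (illustrative, not part of the
# original module); assumes a fetcher has already produced a response
# dict with the keys accessed above ('content', 'headers', 'redirects',
# 'parser'):
#
#     durl = await Durl('https://example.org/notes.txt')
#     resp = {
#         'content': raw_text,
#         'headers': {},
#         'redirects': [],
#         'parser': 'plaintext',
#     }
#     resource = await parse_plaintext(durl, resp, site=None)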


def annotate_text(text):
    """
    Return annotations as :func:`utils.annotation.annotate` does.

    Here we only have information on semantic breaks (in plaintext
    they are where empty lines are).
    """
    semantic_breaks = {}
    for match in re_nn.finditer(text):
        # the offset of the first newline of each blank-line pair
        # marks a semantic break
        semantic_breaks[match.span()[0]] = ''
    annotations = {
        'tags': {},
        'semantic_breaks': semantic_breaks,
        'section_ids': {},
        'links': {},
    }
    return text, annotations
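

if __name__ == '__main__':
    # Minimal self-check of annotate_text (illustrative, not part of
    # the original module). In the sample below the two blank-line
    # pairs start at offsets 5 and 19, so those offsets are recorded
    # as semantic breaks.
    sample = 'Intro\n\nNext section\n\nEnd'
    _, anns = annotate_text(sample)
    print(anns['semantic_breaks'])  # {5: '', 19: ''}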