atextcrawler/src/atextcrawler/utils/muse.py

"""
Parse muse-formatted plaintext (delivered by amusewiki).
"""

import re
from datetime import datetime
from typing import Optional

from .date_finder import extract_latest_date
from .lang import clean_lang

# Matches a markup tag: '<', then the shortest run of one or more characters
# other than '<', then '>'. Used to strip tags from the title and summary.
re_tag = re.compile(r'<[^<]+?>')


def parse_muse(text: str) -> Optional[tuple[dict, str]]:
    """
    Parse a MUSE string into its meta information and its text body.

    Return a tuple `(meta, body)`, or None if the text has no head
    or if no fields could be parsed from the head.
    """
    head, body = split_head_body(text)
    # The head is only parsed when there is one; an empty head and an
    # empty field dict both mean "not a usable MUSE document".
    fields = parse_head(head) if head else None
    if not fields:
        return None
    return extract_muse_meta(fields, body), body


def split_head_body(text: str) -> tuple[str, str]:
    """
    Split a MUSE string into head and body and return both.

    The head consists of the leading consecutive lines that begin
    with '#'; everything after them is the body. Both parts are
    returned stripped of surrounding whitespace. If the text does not
    begin with '#', the head is the empty string.
    """
    pos = 0
    # Advance an offset instead of repeatedly slicing the text.
    while text.startswith('#', pos):
        newline = text.find('\n', pos)
        if newline == -1:
            # The last head line has no trailing newline: the head extends
            # to the end of the text. (Without this check the scan would
            # never advance.)
            pos = len(text)
            break
        pos = newline + 1
    return text[:pos].strip(), text[pos:].strip()


def parse_head(text: str) -> dict:
    """
    Parse a MUSE head and return a dict mapping field names to values.

    Each non-blank line is expected to look like `#name value`: the
    first character of the first word is dropped to obtain the field
    name and the remainder of the line after the first space is the
    value. A line without a value (e.g. `#notoc`) maps the field name
    to the empty string. Blank lines are ignored, so an empty head
    yields an empty dict.
    """
    fields = {}
    for line in text.split('\n'):
        line = line.strip()
        if not line:
            continue
        # partition() never fails: a line without a space yields value ''.
        name, _, value = line.partition(' ')
        fields[name[1:]] = value
    return fields


# Names of amusewiki header fields (without the leading '#').
# Entries that are commented out are kept for reference only; most of them
# are marked as irrelevant.
amusewiki_fields = [
    'author',
    'title',
    'lang',
    'LISTtitle',  # reduced title for alphabetical sorting
    'subtitle',
    'SORTauthors',  # authors separated by ';' or ',' (only for indexing)
    'SORTtopics',  # topics separated by ';' or ',' (only for indexing)
    'date',  # publication year
    'pubdate',  # publication datetime
    'notes',  # additional info (orig title, translators, credits, ...)
    'source',  # preferred format: "Retrieved on March 8, 2012 from {URL}"
    'publisher',
    'isbn',
    #'rights',
    'seriesname',
    'seriesnumber',
    #'hyphenation',       # irrelevant
    #'slides',            # irrelevant
    #'DELETED',           # irrelevant
    #'cover',             # irrelevant
    #'coverwidth',        # irrelevant
    #'nocoverpage',       # irrelevant
    #'notoc',             # irrelevant
    #'nofinalpage',       # irrelevant
    #'impressum',         # irrelevant
    #'continuefootnotes', # irrelevant
    #'centerchapter',     # irrelevant
    #'centersection',     # irrelevant
]
"""
Amusewiki fields are (cf. https://amusewiki.org/library/manual)
"""


# Separators (';' or ',') used to split list-valued header fields such as
# SORTauthors and SORTtopics.
re_list = re.compile('[;,]')


def extract_muse_meta(meta: dict, body: str) -> dict:
    """
    Extract meta information from muse header and muse body.

    `meta` maps header field names to their string values and `body`
    is the text following the header.

    The returned dict has these keys:

    * `title`: the title with markup tags removed, or None
    * `authors`: set of author names taken from the fields
      'author' and 'SORTauthors'
    * `lang`: result of `clean_lang` applied to the 'lang' field
    * `keywords`: list of topics taken from the field 'SORTtopics'
    * `pub_date`: publication datetime derived from the field
      'pubdate', or None if that field is missing or empty
    * `summary`: first paragraph found within the first 1000
      characters of the body, with markup tags removed
    * `orig_source`: the stripped 'source' field, or None
    """
    authors = set()
    if author := meta.get('author', '').strip():
        authors.add(author)
    if sortauthors := meta.get('SORTauthors', '').strip():
        for name in re_list.split(sortauthors):
            if name_ := name.strip():
                authors.add(name_)

    # Default to '' so that a header without 'pubdate' does not raise
    # an AttributeError on None.
    pubdate = meta.get('pubdate', '').strip()
    pub_date: Optional[datetime] = None
    if pubdate:
        try:
            pub_date = datetime.fromisoformat(pubdate)
        except ValueError:
            # Not an ISO 8601 string; fall back to searching for a date.
            pub_date = extract_latest_date(pubdate)

    # Only the first 1000 characters are searched for the first paragraph.
    summary = re_tag.sub('', body[:1000].split('\n\n')[0])
    return {
        'title': re_tag.sub('', meta.get('title', '')) or None,
        'authors': authors,
        'lang': clean_lang(meta.get('lang')),
        'keywords': [
            s.strip()
            for s in re_list.split(meta.get('SORTtopics', '').strip())
            if s.strip()
        ],
        'pub_date': pub_date,
        'summary': summary,
        'orig_source': meta.get('source', '').strip() or None,
    }