"""
Parse muse-formatted plaintext (delivered by amusewiki).
"""

import re
from datetime import datetime
from typing import Optional

from .date_finder import extract_latest_date
from .lang import clean_lang


# Matches one markup tag such as ``<em>`` or ``</em>``.
re_tag = re.compile(r'<[^<]+?>')


def parse_muse(text: str) -> Optional[tuple[dict, str]]:
    """
    Parse a MUSE string returning meta information and the text body.

    Returns None if *text* has no header (it does not start with '#')
    or if no header field could be parsed; otherwise returns a tuple
    of the extracted meta information and the stripped body.
    """
    head, body = split_head_body(text)
    if not head:
        return None
    meta = parse_head(head)
    if not meta:
        return None
    return extract_muse_meta(meta, body), body


def split_head_body(text: str) -> tuple[str, str]:
    """
    Split a MUSE string into head and body and return both.

    The head consists of all leading lines beginning with '#'; the body
    is everything after them. Both parts are returned stripped.
    """
    head_lines = []
    while text.startswith('#'):
        # partition() always consumes the current line, even when it is
        # the last one and has no trailing newline; slicing at
        # find('\n') + 1 would consume nothing then and loop forever.
        line, _, text = text.partition('\n')
        head_lines.append(line)
    return '\n'.join(head_lines).strip(), text.strip()


def parse_head(text: str) -> dict:
    """
    Parse a MUSE head and return a dict mapping field names to values.

    Each header line has the form ``#name value``. A directive without
    a value (e.g. ``#notoc``) is mapped to the empty string. Lines that
    do not begin with '#' followed by a name are skipped.
    """
    fields = {}
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line.startswith('#'):
            continue
        name, _, value = line[1:].partition(' ')
        if not name:
            continue
        # Directives may be padded with extra spaces for alignment.
        fields[name] = value.strip()
    return fields


amusewiki_fields = [
    'author',
    'title',
    'lang',
    'LISTtitle',  # reduced title for alphabetical sorting
    'subtitle',
    'SORTauthors',  # authors separated by ';' or ',' (only for indexing)
    'SORTtopics',  # topics separated by ';' or ',' (only for indexing)
    'date',  # publication year
    'pubdate',  # publication datetime
    'notes',  # additional info (orig title, translators, credits, ...)
    'source',  # preferred format: "Retrieved on March 8, 2012 from {URL}"
    'publisher',
    'isbn',
    #'rights',
    'seriesname',
    'seriesnumber',
    #'hyphenation',  # irrelevant
    #'slides',  # irrelevant
    #'DELETED',  # irrelevant
    #'cover',  # irrelevant
    #'coverwidth',  # irrelevant
    #'nocoverpage',  # irrelevant
    #'notoc',  # irrelevant
    #'nofinalpage',  # irrelevant
    #'impressum',  # irrelevant
    #'continuefootnotes',  # irrelevant
    #'centerchapter',  # irrelevant
    #'centersection',  # irrelevant
]
"""
Header fields used by amusewiki (cf. https://amusewiki.org/library/manual).
"""


# Separators allowed in list-valued header fields.
re_list = re.compile('[;,]')


def extract_muse_meta(meta, body) -> dict:
    """
    Extract meta information from muse header and muse body.

    *meta* is a dict of header fields as returned by `parse_head`;
    *body* is the text following the header. The returned dict has the
    keys 'title', 'authors', 'lang', 'keywords', 'pub_date', 'summary'
    and 'orig_source'.
    """
    # Merge the display author with the indexing authors; a set drops
    # names that appear in both fields.
    authors = set()
    if author := meta.get('author', '').strip():
        authors.add(author)
    if sortauthors := meta.get('SORTauthors', '').strip():
        for candidate in re_list.split(sortauthors):
            if name := candidate.strip():
                authors.add(name)

    # The 'pubdate' field is optional; default to '' so that a missing
    # field does not raise AttributeError on None.
    pubdate = meta.get('pubdate', '').strip()
    pub_date: Optional[datetime] = None
    if pubdate:
        try:
            pub_date = datetime.fromisoformat(pubdate)
        except ValueError:
            # Not ISO 8601: defer to the project's date finder.
            pub_date = extract_latest_date(pubdate)

    # The summary is the first paragraph within the first 1000
    # characters of the body, with markup tags removed.
    summary = re_tag.sub('', body[:1000].split('\n\n')[0])

    return {
        'title': re_tag.sub('', meta.get('title', '')) or None,
        'authors': authors,
        'lang': clean_lang(meta.get('lang')),
        'keywords': [
            topic
            for s in re_list.split(meta.get('SORTtopics', '').strip())
            if (topic := s.strip())
        ],
        'pub_date': pub_date,
        'summary': summary,
        'orig_source': meta.get('source', '').strip() or None,
    }