atextcrawler/src/atextcrawler/utils/muse.py

121 lines
3.4 KiB
Python

"""
Parse muse-formatted plaintext (delivered by amusewiki).
"""
import re
from datetime import datetime
from typing import Optional
from .date_finder import extract_latest_date
from .lang import clean_lang
# Matches one angle-bracket tag such as ``<em>`` or ``</em>``: a '<', then
# the shortest non-empty run of characters other than '<', then '>'.
re_tag = re.compile(r'<[^<]+?>')
def parse_muse(text: str) -> Optional[tuple[dict, str]]:
    """
    Break a MUSE document into its meta information and its text body.

    Return a tuple ``(meta, body)`` where ``meta`` is the dict produced by
    ``extract_muse_meta``. Return None if the text has no header or if no
    fields could be parsed from the header.
    """
    head, body = split_head_body(text)
    # An empty head is never handed to the header parser.
    fields = parse_head(head) if head else None
    if not fields:
        return None
    return extract_muse_meta(fields, body), body
def split_head_body(text: str) -> tuple[str, str]:
    """
    Split a MUSE string into head and body and return both.

    The head consists of all leading lines that begin with '#'; the body
    is everything after them. Both parts are returned stripped of
    surrounding whitespace. If the text does not begin with '#', the head
    is the empty string.
    """
    head_lines = []
    while text.startswith('#'):
        # partition() always consumes the current line, even when it is
        # the last one and has no trailing newline; advancing by
        # find('\n') + 1 would make no progress there and loop forever.
        line, _, text = text.partition('\n')
        head_lines.append(line)
    return '\n'.join(head_lines).strip(), text.strip()
def parse_head(text: str) -> dict[str, str]:
    """
    Parse a MUSE head and return a dict mapping field names to values.

    Each line is expected to look like ``#name value``. The first
    character of the name (the '#') is dropped. Name and value may be
    separated by any run of whitespace. A directive without a value maps
    to the empty string, and blank lines are ignored. If a field occurs
    more than once, the last occurrence wins.
    """
    fields: dict[str, str] = {}
    for line in text.split('\n'):
        # Splitting on arbitrary whitespace tolerates tabs and repeated
        # spaces and yields a value without leading whitespace.
        parts = line.strip().split(None, 1)
        if not parts:
            continue  # blank line
        name = parts[0]
        # A bare directive such as "#notoc" has no value part.
        value = parts[1] if len(parts) > 1 else ''
        fields[name[1:]] = value
    return fields
# Names of the amusewiki header directives, written without the leading '#'.
# Commented-out entries are kept for reference only and are not part of the
# list; most of them are marked as irrelevant.
amusewiki_fields = [
    'author',
    'title',
    'lang',
    'LISTtitle', # reduced title for alphabetical sorting
    'subtitle',
    'SORTauthors', # authors separated by ';' or ',' (only for indexing)
    'SORTtopics', # topics separated by ';' or ',' (only for indexing)
    'date', # publication year
    'pubdate', # publication datetime
    'notes', # additional info (orig title, translators, credits, ...)
    'source', # preferred format: "Retrieved on March 8, 2012 from {URL}"
    'publisher',
    'isbn',
    #'rights',
    'seriesname',
    'seriesnumber',
    #'hyphenation', # irrelevant
    #'slides', # irrelevant
    #'DELETED', # irrelevant
    #'cover', # irrelevant
    #'coverwidth', # irrelevant
    #'nocoverpage', # irrelevant
    #'notoc', # irrelevant
    #'nofinalpage', # irrelevant
    #'impressum', # irrelevant
    #'continuefootnotes', # irrelevant
    #'centerchapter', # irrelevant
    #'centersection', # irrelevant
]
"""
Amusewiki fields are (cf. https://amusewiki.org/library/manual)
"""
# Separator for list-valued header fields: a single ';' or ','.
re_list = re.compile('[;,]')
def extract_muse_meta(meta, body) -> dict:
    """
    Extract meta information from muse header and muse body.

    :param meta: header fields mapping field names (without the leading
        '#') to their string values; every field is optional
    :param body: the muse text following the header
    :return: a dict with these keys:

        * ``title``: the title with tags removed, or None if empty
        * ``authors``: set of names from ``author`` and ``SORTauthors``
        * ``lang``: result of ``clean_lang`` applied to the ``lang`` field
        * ``keywords``: list of non-empty entries of ``SORTtopics``
        * ``pub_date``: the parsed ``pubdate``, or None if absent
        * ``summary``: first paragraph of the body with tags removed
        * ``orig_source``: the stripped ``source`` field, or None if empty
    """
    # Authors come from the display field and from the indexing field;
    # the set drops names that occur in both.
    authors = set()
    if author := meta.get('author', '').strip():
        authors.add(author)
    authors.update(
        name
        for part in re_list.split(meta.get('SORTauthors', ''))
        if (name := part.strip())
    )

    # Default to '' so that a header without #pubdate does not raise.
    pubdate = meta.get('pubdate', '').strip()
    pub_date: Optional[datetime] = None
    if pubdate:
        try:
            pub_date = datetime.fromisoformat(pubdate)
        except ValueError:
            # Not an ISO 8601 string: let the date finder search the text.
            pub_date = extract_latest_date(pubdate)

    # Only the first 1000 characters are examined; the summary is the
    # part of them before the first blank line.
    summary = re_tag.sub('', body[:1000].split('\n\n')[0])

    return {
        'title': re_tag.sub('', meta.get('title', '')) or None,
        'authors': authors,
        'lang': clean_lang(meta.get('lang')),
        'keywords': [
            s.strip()
            for s in re_list.split(meta.get('SORTtopics', '').strip())
            if s.strip()
        ],
        'pub_date': pub_date,
        'summary': summary,
        'orig_source': meta.get('source', '').strip() or None,
    }