121 lines
3.4 KiB
Python
121 lines
3.4 KiB
Python
"""
|
|
Parse muse-formatted plaintext (delivered by amusewiki).
|
|
"""
|
|
|
|
import re
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
|
|
from .date_finder import extract_latest_date
|
|
from .lang import clean_lang
|
|
|
|
# Matches one markup tag: '<', then one or more characters other than '<'
# (non-greedy), then '>'. Used below to strip tags from title and summary.
re_tag = re.compile(r'<[^<]+?>')
|
|
|
|
|
|
def parse_muse(text: str) -> Optional[tuple[dict, str]]:
    """
    Parse a MUSE string into its meta information and its text body.

    Return a tuple ``(meta, body)``, or None when the text has no header
    or the header yields no fields.
    """
    header, content = split_head_body(text)
    # Only attempt to parse a header that is actually present.
    fields = parse_head(header) if header else None
    if not fields:
        return None
    return extract_muse_meta(fields, content), content
|
|
|
|
|
|
def split_head_body(text: str) -> tuple[str, str]:
    """
    Split a MUSE string into head and body and return both.

    The head is the leading run of lines that start with '#'; the body is
    everything after it. Both parts are returned with surrounding
    whitespace stripped. Either part may be the empty string.
    """
    head_lines = []
    while text.startswith('#'):
        # partition() consumes the whole remainder when there is no
        # newline left, so a header line at the very end of the text
        # (no trailing '\n') terminates the loop instead of spinning.
        line, newline, text = text.partition('\n')
        head_lines.append(line + newline)
    return ''.join(head_lines).strip(), text.strip()
|
|
|
|
|
|
def parse_head(text: str) -> dict:
    """
    Parse a MUSE head and return a dict mapping field names to values.

    Each non-blank line is split at its first space: the part before it,
    minus its first character (the leading '#'), is the field name; the
    rest of the line is the value. A line without a space yields an empty
    value. Blank lines are skipped, so an empty head gives an empty dict.
    """
    fields = {}
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        # partition() never fails: a directive without a value simply
        # gets '' instead of raising on tuple unpacking.
        name, _, value = line.partition(' ')
        fields[name[1:]] = value
    return fields
|
|
|
|
|
|
# Amusewiki header fields (cf. https://amusewiki.org/library/manual).
#
# Not listed: 'rights', and the fields marked as irrelevant here:
# 'hyphenation', 'slides', 'DELETED', 'cover', 'coverwidth', 'nocoverpage',
# 'notoc', 'nofinalpage', 'impressum', 'continuefootnotes', 'centerchapter',
# 'centersection'.
amusewiki_fields = [
    # naming and language
    'author',
    'title',
    'lang',
    'LISTtitle',     # reduced title for alphabetical sorting
    'subtitle',
    # indexing only; items separated by ';' or ','
    'SORTauthors',
    'SORTtopics',
    # publication details
    'date',          # publication year
    'pubdate',       # publication datetime
    'notes',         # additional info (orig title, translators, credits, ...)
    'source',        # preferred format: "Retrieved on March 8, 2012 from {URL}"
    'publisher',
    'isbn',
    'seriesname',
    'seriesnumber',
]
|
|
|
|
|
|
# Separator for list-valued header fields: splits on ';' or ','.
# Used below for the 'SORTauthors' and 'SORTtopics' values.
re_list = re.compile('[;,]')
|
|
|
|
|
|
def extract_muse_meta(meta: dict, body: str) -> dict:
    """
    Extract meta information from muse header and muse body.

    :param meta: header fields mapping field name to raw string value
    :param body: text body following the header
    :returns: dict with the keys ``title`` (tags removed, None if empty),
        ``authors`` (set of str), ``lang``, ``keywords`` (list of str),
        ``pub_date``, ``summary`` and ``orig_source`` (None if empty)
    """
    # Merge the display author with the indexing-only author list;
    # the set removes duplicates between the two.
    authors = set()
    if author := meta.get('author', '').strip():
        authors.add(author)
    for part in re_list.split(meta.get('SORTauthors', '')):
        if name := part.strip():
            authors.add(name)

    # 'pubdate' is optional: default to '' so a missing field does not
    # raise AttributeError on None.
    pub_date: Optional[datetime] = None
    if pubdate := meta.get('pubdate', '').strip():
        try:
            pub_date = datetime.fromisoformat(pubdate)
        except ValueError:
            # Not an ISO date string; fall back to the project's date
            # finder (presumably returns a datetime or None -- verify
            # against date_finder.extract_latest_date).
            pub_date = extract_latest_date(pubdate)

    # Summary: the text up to the first blank line within the first
    # 1000 characters of the body, with tags removed.
    summary = re_tag.sub('', body[:1000].split('\n\n')[0])

    keywords = [
        topic
        for part in re_list.split(meta.get('SORTtopics', ''))
        if (topic := part.strip())
    ]

    return {
        'title': re_tag.sub('', meta.get('title', '')) or None,
        'authors': authors,
        'lang': clean_lang(meta.get('lang')),
        'keywords': keywords,
        'pub_date': pub_date,
        'summary': summary,
        'orig_source': meta.get('source', '').strip() or None,
    }
|