132 lines
3.6 KiB
Python
132 lines
3.6 KiB
Python
"""
|
|
Parse documents (often application/pdf).
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from datetime import datetime
|
|
from typing import Optional, Union
|
|
|
|
from tika import parser
|
|
|
|
from ..models import ResourceError, ResourceRedirect, Site, TextResource
|
|
from ..utils.durl import Durl
|
|
from ..utils.http import get_header_links
|
|
from ..utils.lang import extract_content_language
|
|
from .plaintext import annotate_text
|
|
|
|
logger = logging.getLogger(__name__)
# Auxiliary logger for verbose diagnostics; capped at INFO so DEBUG
# messages on this channel are suppressed by default.
logger_debug = logging.getLogger(__name__ + '.debug')
logger_debug.setLevel(logging.INFO)


# Matches absolute http(s) URLs embedded in plain text; used below to
# harvest links from the text extracted by tika.
re_url = re.compile(
    r'((http|https)://[\w_-]+\.[\w_-]+(:[0-9]+)?'
    r'([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)'
)
|
|
|
|
|
|
async def parse_document(
    durl: Durl,
    resp: dict,
    site: Optional[Site],
) -> Optional[Union[TextResource, ResourceError, ResourceRedirect]]:
    """
    Extract plain text from documents in various formats.

    Return a ResourceRedirect if the HTTP headers announce a canonical
    URL differing from *durl*, a ResourceError if tika fails to analyze
    the content, and otherwise a TextResource carrying the extracted
    text, metadata, links and annotations.

    *resp* is expected to contain at least the keys 'content',
    'headers' and 'redirects' (assumed from usage — confirm against
    the caller).
    """
    content = resp['content']

    # HTTP headers: honor a canonical link differing from our URL,
    # remember a shortlink if present.
    header_links = await get_header_links(resp['headers'], durl, site)
    if canonical := header_links.get('canonical'):
        if canonical != durl.url():
            return ResourceRedirect(resp['redirects'] + [canonical])
    shortlink = header_links.get('shortlink')

    # Use tika to extract text.
    # NOTE(review): parser.from_buffer is a blocking call inside an
    # async function — consider run_in_executor if it proves slow.
    doc = parser.from_buffer(content)
    if doc.get('status') != 200:
        msg = f'Analyzing document failed: {durl.url()}'
        return ResourceError(msg)

    # Collect metadata. Tika may return a list of values for a field;
    # for Content-Type the last entry is kept.
    meta = doc.get('metadata', {})
    content_type = meta.get('Content-Type')
    if isinstance(content_type, list):
        content_type = content_type[-1]
    title = concat(meta.get('title'))
    # (A dead `concat(meta.get('creator'))` call was removed here: concat
    # is pure and its result was discarded.)
    last_change = extract_latest(meta.get('date') or meta.get('created'))
    keywords = None

    # Text content.
    text = (doc.get('content') or '').strip()

    # Harvest absolute http(s) links from the plain text, split into
    # same-site (internal) and other-site (external) links.
    links_int: dict[Durl, tuple[list[str], str]] = {}
    links_ext: dict[Durl, tuple[list[str], str]] = {}
    for url in re_url.findall(text):
        link_durl = await Durl(url[0])
        if link_durl:
            if link_durl.site() == durl.site():
                links_int[link_durl] = [], link_durl.url()
            else:
                links_ext[link_durl] = [], link_durl.url()

    # Annotations (may rewrite the text).
    text, annotations = annotate_text(text)

    return TextResource(
        content_type=content_type,
        last_change=last_change,
        text_len=len(text),
        lang=extract_content_language(text),
        title=title,
        init_fields={
            'durl': durl,
            'site': site,
            'headers': resp['headers'],
            'redirects': resp['redirects'],
            'links_int': links_int,
            'links_ext': links_ext,
            'shortlink': shortlink,
            # canonical already handled above (redirect), so None here
            'canonical': None,
        },
        search_fields={
            'title': title,
            'pub_date': last_change,
            'keywords': keywords,
            'text': text,
            'annotations': annotations,
        },
    )
|
|
|
|
|
|
def extract_latest(s: Optional[Union[str, list]]) -> Optional[datetime]:
    """
    Extract the latest date (if any) from a string or list of strings.

    Entries that are not ISO-format datetime strings are silently
    ignored; return None if no entry can be parsed.
    """
    if not s:
        return None
    if not isinstance(s, list):
        s = [s]
    dates = []
    for candidate in s:
        try:
            # Strip a trailing 'Z' (UTC marker): fromisoformat() cannot
            # parse it before Python 3.11.
            dates.append(datetime.fromisoformat(candidate.rstrip('Z')))
        except (AttributeError, TypeError, ValueError):
            # Not a string, or not an ISO-formatted datetime -- skip.
            # (Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.)
            pass
    return max(dates) if dates else None
|
|
|
|
|
|
def concat(s: Optional[Union[str, list]]) -> Optional[str]:
    """
    Join a string or list of strings into one space-separated string.

    Return None when the input is empty or missing.
    """
    if not s:
        return None
    parts = s if isinstance(s, list) else [s]
    return ' '.join(parts)
|