atextcrawler/src/atextcrawler/resource/document.py

"""
Parse documents (often application/pdf).
"""
import logging
import re
from datetime import datetime
from typing import Optional, Union

from tika import parser

from ..models import ResourceError, ResourceRedirect, Site, TextResource
from ..utils.durl import Durl
from ..utils.http import get_header_links
from ..utils.lang import extract_content_language
from .plaintext import annotate_text

logger = logging.getLogger(__name__)
logger_debug = logging.getLogger(__name__ + '.debug')
logger_debug.setLevel(logging.INFO)
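
# Heuristic pattern for finding http(s) URLs in plain text extracted by
# tika; such text carries no anchor markup, so links can only be
# recovered from the text itself.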
re_url = re.compile(
r'((http|https)://[\w_-]+\.[\w_-]+(:[0-9]+)?'
r'([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)'
)


async def parse_document(
durl: Durl,
resp: dict,
site: Optional[Site],
) -> Optional[Union[TextResource, ResourceError, ResourceRedirect]]:
"""
Extract plain text from documents in various formats.
"""
content = resp['content']
# HTTP headers, canonical URL, shortlink
header_links = await get_header_links(resp['headers'], durl, site)
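    # A canonical link that differs from the current URL is treated as a
    # redirect, so the canonical variant is fetched instead of this one.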
if canonical := header_links.get('canonical'):
if canonical != durl.url():
return ResourceRedirect(resp['redirects'] + [canonical])
shortlink = header_links.get('shortlink')
# use tika to extract text
doc = parser.from_buffer(content)
# logger.debug(pformat(doc))
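    # tika returns a dict with 'status', 'metadata' and 'content';
    # any status other than 200 means the document could not be analyzed.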
if doc.get('status') != 200:
msg = f'Analyzing document failed: {durl.url()}'
return ResourceError(msg)
# collect meta data
meta = doc.get('metadata', {})
content_type = meta.get('Content-Type')
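    # tika can report Content-Type as a list (several values for one
    # document); in that case keep only the last entry.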
if isinstance(content_type, list):
content_type = content_type[-1]
    title = concat(meta.get('title'))
    author = concat(meta.get('creator'))  # extracted, but currently unused
    last_change = extract_latest(meta.get('date') or meta.get('created'))
keywords = None
# text content
text = (doc.get('content') or '').strip()
# links
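    # The extracted text has no markup, so hyperlinks are recovered with
    # re_url and sorted into internal and external links by site.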
links_int: dict[Durl, tuple[list[str], str]] = {}
links_ext: dict[Durl, tuple[list[str], str]] = {}
for url in re_url.findall(text):
link_durl = await Durl(url[0])
if link_durl:
if link_durl.site() == durl.site():
links_int[link_durl] = [], link_durl.url()
else:
links_ext[link_durl] = [], link_durl.url()
# annotations
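    # annotate_text returns the (possibly normalized) text together with
    # annotations, which go into the search fields below.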
text, annotations = annotate_text(text)
return TextResource(
content_type=content_type,
last_change=last_change,
text_len=len(text),
lang=extract_content_language(text),
title=title,
init_fields={
'durl': durl,
'site': site,
'headers': resp['headers'],
'redirects': resp['redirects'],
'links_int': links_int,
'links_ext': links_ext,
'shortlink': shortlink,
'canonical': None,
},
search_fields={
'title': title,
'pub_date': last_change,
'keywords': keywords,
'text': text,
'annotations': annotations,
},
)


def extract_latest(s: Optional[Union[str, list]]) -> Optional[datetime]:
"""
    Extract the latest date (if any) from a string or list of strings.
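
    Example (illustrative):

    >>> extract_latest(['2021-06-01T12:00:00Z', '2021-06-02'])
    datetime.datetime(2021, 6, 2, 0, 0)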
"""
if not s:
return None
if not isinstance(s, list):
s = [s]
dt = []
for t in s:
try:
dt.append(datetime.fromisoformat(t.rstrip('Z')))
        except (AttributeError, ValueError):
            # ignore values that cannot be parsed as ISO dates
            pass
return max(dt) if dt else None


def concat(s: Optional[Union[str, list]]) -> Optional[str]:
    """
    Join a string or a list of strings into one space-separated string.

    Return None if the input is empty.
    """
if not s:
return None
if not isinstance(s, list):
s = [s]
return ' '.join(s)
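

# Illustrative usage sketch (not executed as part of this module; it assumes
# a locally available PDF file, a reachable tika server, and that an empty
# headers dict is acceptable to get_header_links):
#
#     import asyncio
#
#     async def _example():
#         durl = await Durl('https://example.org/some.pdf')
#         with open('some.pdf', 'rb') as f:
#             resp = {'content': f.read(), 'headers': {}, 'redirects': []}
#         return await parse_document(durl, resp, None)
#
#     result = asyncio.run(_example())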