""" Parse documents (often application/pdf). """ import logging import re from datetime import datetime from typing import Optional, Union from tika import parser from ..models import ResourceError, ResourceRedirect, Site, TextResource from ..utils.durl import Durl from ..utils.http import get_header_links from ..utils.lang import extract_content_language from .plaintext import annotate_text logger = logging.getLogger(__name__) logger_debug = logging.getLogger(__name__ + '.debug') logger_debug.setLevel(logging.INFO) re_url = re.compile( r'((http|https)://[\w_-]+\.[\w_-]+(:[0-9]+)?' r'([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)' ) async def parse_document( durl: Durl, resp: dict, site: Optional[Site], ) -> Optional[Union[TextResource, ResourceError, ResourceRedirect]]: """ Extract plain text from documents in various formats. """ content = resp['content'] # HTTP headers, canonical URL, shortlink header_links = await get_header_links(resp['headers'], durl, site) if canonical := header_links.get('canonical'): if canonical != durl.url(): return ResourceRedirect(resp['redirects'] + [canonical]) shortlink = header_links.get('shortlink') # use tika to extract text doc = parser.from_buffer(content) # logger.debug(pformat(doc)) if doc.get('status') != 200: msg = f'Analyzing document failed: {durl.url()}' return ResourceError(msg) # collect meta data meta = doc.get('metadata', {}) content_type = meta.get('Content-Type') if isinstance(content_type, list): content_type = content_type[-1] title = concat(meta.get('title')) concat(meta.get('creator')) last_change = extract_latest(meta.get('date') or meta.get('created')) keywords = None # text content text = (doc.get('content') or '').strip() # links links_int: dict[Durl, tuple[list[str], str]] = {} links_ext: dict[Durl, tuple[list[str], str]] = {} for url in re_url.findall(text): link_durl = await Durl(url[0]) if link_durl: if link_durl.site() == durl.site(): links_int[link_durl] = [], link_durl.url() else: links_ext[link_durl] = [], link_durl.url() # annotations text, annotations = annotate_text(text) return TextResource( content_type=content_type, last_change=last_change, text_len=len(text), lang=extract_content_language(text), title=title, init_fields={ 'durl': durl, 'site': site, 'headers': resp['headers'], 'redirects': resp['redirects'], 'links_int': links_int, 'links_ext': links_ext, 'shortlink': shortlink, 'canonical': None, }, search_fields={ 'title': title, 'pub_date': last_change, 'keywords': keywords, 'text': text, 'annotations': annotations, }, ) def extract_latest(s: Optional[Union[str, list]]) -> Optional[datetime]: """ Extract the lastest date (if any) from a string or list of strings. """ if not s: return None if not isinstance(s, list): s = [s] dt = [] for t in s: try: dt.append(datetime.fromisoformat(t.rstrip('Z'))) except: pass return max(dt) if dt else None def concat(s: Optional[Union[str, list]]) -> Optional[str]: """ Helper function for joining strings together. """ if not s: return None if not isinstance(s, list): s = [s] return ' '.join(s)