""" Convert html to plain text with annotations over character ranges. """ import re from collections import defaultdict from html.parser import HTMLParser from .json import json_dumps, json_loads from .link import nofollow_link_rels from .tag import keep_tags, self_closing_tags MAX_HREF_LENGTH = 200 """ Maximum length of an href. Other links are discarded. """ text_blacklist = [ 'previous', 'next', 'back', # common pagination navigation '↩︎', # amusewiki footnote separator (after conversion from muse to html) ] """ Texts to ignore. """ class AnnotatingParser(HTMLParser): """ Parse tagged text resulting in pure text and annotations. The text is available in self.text and the annotations in self.annotations, which is a dict with these keys: * tags: contains a mapping of offset ranges (i, f) to the tags opening at i and closing at f * semantic_breaks: a mapping of offset positions where a new section begins to the nesting level of that sections; a section is whereever an (opening or closing) separating tag is placed in the raw html; for the separating flag of tags see tag.py * links: a mapping of hrefs to link texts obtained from anchor (a) tags; we skip hyperref with nofollow rels * section_ids: map an offset position to the first id attribute (of any tag) at the beginning of a semantic section; this can later be used in a URL fragment for linking directly into this section Internally, we put opening tags on self.stack and pop them when the first matching closing tag is encountered. We assume balanced tags (tidy html). NB: all tags with semantic breaks have sep=True, i.e., they will have spaces around them so that the semantic breaks always sit on a space; the semantic break position p is the end of the last section and the next sections begins at p + 1. The text alway begins with a ' ' (added if not in the original), which is assigned a semantic break with default level 80 (if there is no semantic break tag at the beginning). """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.text = ' ' # concatenated text data (without tags) self.pos = 1 # equal to len(self.text) self.stack = [] self.tags = defaultdict(dict) self.semantic_breaks = {0: 80} self.tag_id = None self.section_ids = defaultdict(list) self.links = {} self.add_space = False def close(self): """ Finish by collecting results in dict `self.annotations`. """ super().close() self.annotations = {} self.annotations['links'] = self.links self.annotations['semantic_breaks'] = { pos: lvl for pos, lvl in sorted(self.semantic_breaks.items()) } self.annotations['tags'] = self.tags self.annotations['section_ids'] = self.section_ids def handle_starttag(self, tag, attrs): """ Called for each opening tag. """ sep, lvl, sem = keep_tags[tag] attrs = dict(attrs) if sep: self.add_space = True if tag == 'section' and 'endnotes' in attrs.get('role', ''): lvl = 25 # ARIA roles if role := attrs.get('role'): if role == 'article': lvl = 15 elif role == 'heading': if aria_level := attrs.get('aria-level'): if aria_level in (1, 2, 3, 4, 5, 6): sep, lvl, sem = keep_tags[f'h{aria_level}'] elif role == 'region': lvl = 24 i = self.pos if tag in self_closing_tags: # self-closing tags will not be added to the result tags, # they only appear in semantic_breaks # the two self-closing tags br and hr both have lvl and sep if i == 1: # replace the default semantic break at pos 0 i = 0 self.add_semantic_break(i, lvl) i += 1 if tag_id := attrs.get('id'): self.tag_id = i, tag_id self.add_tag_id(i) # br or hr may have an id, too self.add_space = True else: self.stack.append((i, tag, sep, lvl, sem, attrs)) # forget outdated tag id at new semantic break if lvl: self.forget_tag_id() # memorize tag id if not self.tag_id and (tag_id := attrs.get('id')): self.tag_id = self.pos, tag_id def handle_endtag(self, tag): """ Called for each closing tag. """ if not self.stack or (self.stack and self.stack[-1][1] != tag): return # nothing to do for an already closed self-closing tag i, tag_, sep, lvl, sem, attrs = self.stack.pop() f = self.pos # omit tag without content if i == f: return # for a closing div tag revise lvl to minimum level of contained # semantic breaks (if any) if tag == 'div': min_lvl = 101 for pos_, lvl_ in reversed(self.semantic_breaks.items()): if pos_ <= i: break min_lvl = min(min_lvl, lvl_) if min_lvl < 101: lvl = min_lvl # add semantic break and an optional section_id if lvl: if i == 1: # replace the default semantic break at pos 0 i = 0 if tag in ('ul', 'ol', 'li'): seen_tags = [x[1] for x in self.stack] if 'p' not in seen_tags: lvl = 52 + seen_tags.count('tag') if tag == 'li': lvl += 1 self.add_semantic_break(i, lvl) self.add_tag_id(i) # do not include surrounding spaces in tag span if self.text[i] == ' ': i += 1 # add tag self.tags[(i, f)][tag] = sem # add space (when handling next data) if sep: self.add_space = True # collect links if tag == 'a': self.extract_link(i, attrs) def handle_data(self, text): """ Called for each non-tag content between tags. """ # handle empty or blacklisted text if text == '': return if text == ' ': self.add_space = True return if text.strip().lower() in text_blacklist: if ' ' in text: self.add_space = True return # add a space (at self.pos) if the text begins with one # or if we shall add one startswith_space = text.startswith(' ') text = text.lstrip() if startswith_space or self.add_space: if self.text[-1] != ' ': self.text += ' ' self.pos += 1 self.add_space = False # strip a space at the end of text and handle it in end tag if text.endswith(' '): text = text[:-1] self.add_space = True # add text to self.text self.text += text self.pos += len(text) def add_semantic_break(self, pos, lvl): """ Add a semantic break of level *lvl* at position *pos*. """ if pos in self.semantic_breaks: self.semantic_breaks[pos] = min(self.semantic_breaks[pos], lvl) else: self.semantic_breaks[pos] = lvl def forget_tag_id(self): """ Reset a tag id if it is too far behind in the text stream. """ if self.tag_id: pos_, tag_id = self.tag_id if pos_ + 200 < self.pos: self.tag_id = None def add_tag_id(self, pos): """ Add and clear an id if the just closing section has none yet. *pos* is the start position of the current section, and the position where the id will be added. Add an id only if we are not too far in the section's text already. """ if self.tag_id: pos_, tag_id = self.tag_id if pos_ < pos + 100 and pos not in self.section_ids: self.section_ids[pos].append(tag_id.lower()) self.tag_id = None def extract_link(self, i, attrs): """ Add a link covering character range (i, self.pos). From html *attrs* extract href and rel. """ if (href := attrs.get('href')) and not attrs.get('rel') == 'nofollow': if href.startswith('#'): return if len(href) > MAX_HREF_LENGTH: return attrs.get('title', '') if rel := attrs.get('rel'): if set(rel) & nofollow_link_rels: return self.links[href] = i, self.pos, rel def annotate(html): """ Split html text into plain text with annotations (from AnnotatingParser). """ parser = AnnotatingParser() parser.reset() parser.feed(html) parser.close() return parser.text, parser.annotations re_footnote = re.compile(r'^\s*\[\d+\]\s+') def headline_probability(text, tags, lvl) -> float: """ Estimate the probability that the text with tags is a headline. The context is not considered: The question is not whether the text is a headline for the following text. """ text = text.strip() res = 0.0 if not text: return res if lvl < 60: return 1.0 # if 'h1' in tags or 'h2' in tags or 'h3' in tags or\ # 'h4' in tags or 'h5' in tags or 'h6' in tags or 'center' in tags: # return 1.0 if len(text) < 80: res = 0.7 else: res = 0.7 - 0.7 * (len(text) - 80) / 200 if 'p' in tags: res -= 0.4 if 'em' in tags: res += 0.3 if 'a' in tags: res -= 0.1 if text[-1] in '.:': res -= 0.3 res -= 0.1 * text.count(', ') if re_footnote.match(text): res -= 0.4 return max(res, 0.0) def get_tag_counts(tag_names, i, f, tags, text) -> tuple[int, float, float]: """ Return the info on the share of characters covered with one of the *tags*. Only consider the characters between i and f of string *text*. Return the number of tags that have an overlap in the specified region, the tag density in the region (fraction of covered characters by all), and the average number of covered chars per tag. NB: If more than one tag name is given, then the fractional share may exceed 1. """ if i == f: return 0, 0.0, 0.0 tag_count = 0 covered_chars = 0 for (s_i, s_f), anns in tags.items(): if overlap := range_overlap(i, f - 1, s_i, s_f - 1): for ann in anns: if ann in tag_names: tag_count += 1 covered_chars += overlap[1] - overlap[0] all_chars = f - i tag_density = covered_chars * 1.0 / all_chars avg_text_len = covered_chars * 1.0 / tag_count if tag_count else 0 return tag_count, tag_density, avg_text_len def range_overlap(i1, f1, i2, f2): """ Return the overlap of both ranges (None if there is none). """ return None if f1 <= i2 or f2 <= i1 else (max(i1, i2), min(f1, f2)) def annotations_remove_section(annotations, i, f): """ Remove section (i, f) from annotations and return result. """ new_annotations = {} d = f - i if not d: return annotations # relocate tags new_tags = {} for (t_i, t_f), anns in annotations['tags'].items(): n_i, n_f = cut_range(i, f, d, t_i, t_f) if n_i is not None: new_tags[(n_i, n_f)] = anns new_annotations['tags'] = new_tags # relocate links new_links = {} for href, (l_i, l_f, rel) in annotations['links'].items(): n_i, n_f = cut_range(i, f, d, l_i, l_f) if n_i is not None: new_links[href] = n_i, n_f, rel # relocate semantic breaks and section_ids semantic_breaks = annotations['semantic_breaks'] section_ids = annotations['section_ids'] new_semantic_breaks = {} new_section_ids = {} for pos in sorted(semantic_breaks.keys()): level = semantic_breaks[pos] if i <= pos and pos < f: continue # discard elif f <= pos: new_semantic_breaks[pos - d] = level if pos in section_ids: new_section_ids[pos - d] = section_ids[pos] else: new_semantic_breaks[pos] = level if pos in section_ids: new_section_ids[pos] = section_ids[pos] # collect and return results new_annotations['semantic_breaks'] = new_semantic_breaks new_annotations['section_ids'] = new_section_ids new_annotations['links'] = new_links return new_annotations def cut_range(i, f, d, t_i, t_f): """ Return the new coordinates of a text range (t_i,t_f) after cutting (i,f). If (t_i,t_f) is fully within (i,f), return None, None. """ if t_f < i: return t_i, t_f elif t_i < i <= t_f <= f: return t_i, i elif t_i < i and f <= t_f: return t_i, t_f - d elif i <= t_i and t_f <= f: return None, None elif i <= t_i <= f < t_f: return i, t_f - d else: # f < t_i return t_i - d, t_f - d def clean_annotations(annotations: dict) -> None: """ Remove void stuff from annotations. """ cleaned_tags = {} for (i, f), anns in annotations['tags'].items(): if f > i and anns: cleaned_tags[(i, f)] = anns annotations['tags'] = cleaned_tags def pack_annotations(annotations): """ Pack annotations to a special JSON string, reducing their volume a little. """ return json_dumps( { 'tags': _pack_tags(annotations['tags']), 'semantic_breaks': ','.join( [ f'{pos}:{level}' for pos, level in annotations['semantic_breaks'].items() ] ), 'section_ids': annotations['section_ids'], 'links': annotations['links'], } ) def _pack_tags(tags: dict) -> str: """ Utility function for packing tag information into a string. """ res = '' for (i, f), anns in tags.items(): if anns: anns_ = ','.join([f'{tag}={sem}' for tag, sem in anns.items()]) res += f'{i}-{f}:{anns_}\n' return res def unpack_annotations(json_text: str) -> dict: """ Unpack tag information from a string. """ annotations = json_loads(json_text) tags = {} for line in annotations['tags'].split('\n'): if line: range_, anns_ = line.split(':') i, f = range_.split('-') i = int(i) f = int(f) anns = {} if anns_: for ann_ in anns_.split(','): tag_, sem_ = ann_.split('=') anns[tag_] = sem_ tags[(i, f)] = anns semantic_breaks = {} for sb_ in annotations['semantic_breaks'].split(','): pos_, lvl_ = sb_.split(':') semantic_breaks[int(pos_)] = int(lvl_) return { 'tags': tags, 'semantic_breaks': semantic_breaks, 'section_ids': annotations['section_ids'], 'links': annotations['links'], }