75 lines
2.5 KiB
Python
75 lines
2.5 KiB
Python
"""
|
|
Operations on text sections.
|
|
|
|
Semantic breaks are character positions within a text (0-offset)
|
|
where a new section begins. More precisely, the character position
|
|
contains a space and only at the next position begins a tag that is
|
|
semantically breaking (e.g., a h1 or a br).
|
|
|
|
Each semantic break has a level, which means breaking strength.
|
|
The lower the level (e.g., h1 has a lower level than h2), the
|
|
stronger the break.
|
|
|
|
Implicitly, if position 0 has no semantic break, a semantic break
|
|
at position 0 with level 80 is added.
|
|
|
|
Semantic breaks can be used to split a text into sections.
|
|
The lower the maximum level of the semantic breaks taken into account,
|
|
the coarser the segmentation and the fewer the sections.
|
|
Each section is given the level of the semantic break at its beginning.
|
|
|
|
From another point of view, sections have levels indicating
|
|
the segmentation depth.
|
|
|
|
The levels for html tags are defined in tag.py.
|
|
|
|
The *semantic_breaks* argument in the functions below
|
|
is a dict mapping the character position of the semantic break
|
|
to the level of a section beginning at this position
|
|
(if segmentation is done at this or a higher level).
|
|
"""
|
|
|
|
|
|
def iter_sections(text, semantic_breaks, max_level=59):
    """
    Split *text* at semantic breaks and iterate over the resulting sections.

    Only breaks whose level is less than or equal to *max_level* start a
    new section; breaks with a higher level are ignored. Breaks are
    visited in ascending order of position.

    Yield tuples (start_pos, end_pos, level, section_text), where *level*
    is the level of the break at *start_pos* (or 80 for the first section
    if *semantic_breaks* has no entry for position 0).

    For every section except the last one, *section_text* omits the
    character at *start_pos* (``text[start_pos + 1 : end_pos]``); the last
    section is yielded as ``text[start_pos:]`` and thus keeps it.
    """
    text_len = len(text)
    start = 0
    # Level of the section beginning at position 0; 80 is the fallback
    # used when no break is registered there.
    start_level = semantic_breaks.get(0, 80)
    for break_pos, break_level in sorted(semantic_breaks.items()):
        # Skip breaks that are too weak, and a break that coincides with
        # the current section start (it would produce an empty section).
        if break_level > max_level or break_pos == start:
            continue
        yield start, break_pos, start_level, text[start + 1 : break_pos]
        start, start_level = break_pos, break_level
    # Emit whatever remains after the last accepted break.
    if start < text_len:
        yield start, text_len, start_level, text[start:]
|
|
|
|
|
|
def concat_section_texts(text, semantic_breaks, min_len=2000):
    """
    Group consecutive sections into chunks of at least *min_len* characters.

    Yield tuples (section_ids, chunk_text). A section id is the index of
    a break position in the iteration order of *semantic_breaks*; the
    levels stored in *semantic_breaks* are not used here.

    A chunk is closed at the first break lying at least *min_len*
    characters after the chunk start. If the text remaining after that
    break is shorter than *min_len*, the remainder is merged into the
    chunk, so that no short trailing chunk is produced. If no break
    closes the last chunk, it is yielded as is and may be shorter than
    *min_len*.

    NOTE(review): the break positions are not sorted here; this presumably
    relies on *semantic_breaks* being ordered by ascending position.
    """
    text_len = len(text)
    chunk_start = 0
    pending_ids = []
    for idx, break_pos in enumerate(semantic_breaks):
        if chunk_start + min_len <= break_pos:
            chunk_end = break_pos
            if text_len - break_pos < min_len:
                # The tail would be too short for a chunk of its own:
                # absorb all sections from this break onwards.
                pending_ids.extend(
                    i for i, k in enumerate(semantic_breaks) if k >= break_pos
                )
                chunk_end = text_len
            yield pending_ids, text[chunk_start:chunk_end]
            chunk_start = chunk_end
            pending_ids = []
        pending_ids.append(idx)
    # Emit the sections collected since the last chunk boundary, if any
    # text is left.
    if chunk_start < text_len:
        yield pending_ids, text[chunk_start:]
|