atextcrawler/src/atextcrawler/utils/section.py

"""
Operations on text sections.

Semantic breaks are character positions within a text (0-offset)
where a new section begins. More precisely, the character position
contains a space, and only at the next position begins a tag that is
semantically breaking (e.g., an h1 or a br).

Each semantic break has a level, which means breaking strength.
The lower the level (e.g., h1 has a lower level than h2), the
stronger the break.

Implicitly, if position 0 has no semantic break, a semantic break
at position 0 with level 80 is added.

Semantic breaks can be used to split a text into sections.
The lower the maximum level of the semantic breaks taken into account,
the coarser the segmentation and the fewer the sections.
Each section is given the level of the semantic break at its beginning.

From another point of view, sections have levels indicating
the segmentation depth.

The levels for html tags are defined in tag.py.

The *semantic_breaks* argument in the functions below
is a dict mapping the character position of the semantic break
to the level of a section beginning at this position
(if segmentation is done at this or a higher level).
"""


def iter_sections(text, semantic_breaks, max_level=59):
    """
    Iterate over sections, limiting to those with a maximum level.

    Yield (start_pos, end_pos, level, text) for each section.

    *text* is assumed to have the first semantic break at position 0.
    If *semantic_breaks* has no entry for position 0, the first section
    gets level 80.

    Only breaks whose level is at most *max_level* start a new section;
    breaks with a higher level are ignored, so their text stays part of
    the enclosing section.

    *start_pos* and *end_pos* are the positions of the delimiting breaks
    (*end_pos* is ``len(text)`` for the last section). The yielded text
    never includes the character at *start_pos* itself, because a break
    position holds the separating space and the content only begins at
    the following position. This holds for every section, including the
    last one.
    """
    n = len(text)
    last_pos = 0
    last_level = semantic_breaks.get(0, 80)
    for pos, level in sorted(semantic_breaks.items()):
        # A break at the current section start (e.g. position 0) must not
        # produce an empty section.
        if level <= max_level and last_pos != pos:
            yield last_pos, pos, last_level, text[last_pos + 1 : pos]
            last_pos = pos
            last_level = level
    if last_pos < n:
        # Skip the separator at last_pos, exactly like for the sections
        # yielded inside the loop.
        yield last_pos, n, last_level, text[last_pos + 1 :]


def concat_section_texts(text, semantic_breaks, min_len=2000):
    """
    Group consecutive sections into chunks of at least *min_len* characters.

    Yield (section_ids, combined_text) for each chunk.

    A section id is the index of a break position in the iteration order
    of *semantic_breaks*. The positions are visited in that order without
    sorting. NOTE(review): presumably the dict is expected to be in
    ascending position order -- confirm against the callers.

    A chunk is closed at the first break lying at least *min_len*
    characters behind the chunk start. If fewer than *min_len* characters
    of *text* remain behind that break, the remainder is merged into the
    current chunk instead of becoming a too short chunk of its own.
    A text without a suitable break is yielded as one single chunk, even
    if it is shorter than *min_len*.
    """
    text_len = len(text)
    chunk_start = 0
    pending_ids = []
    for idx, break_pos in enumerate(semantic_breaks):
        if break_pos < chunk_start + min_len:
            # Chunk is still too short; just collect this section.
            pending_ids.append(idx)
            continue
        chunk_end = break_pos
        if text_len - break_pos < min_len:
            # The tail would be too short: absorb all remaining sections
            # and let the chunk run to the end of the text.
            pending_ids.extend(
                [
                    other_idx
                    for other_idx, other_pos in enumerate(semantic_breaks)
                    if other_pos >= break_pos
                ]
            )
            chunk_end = text_len
        yield pending_ids, text[chunk_start:chunk_end]
        chunk_start = chunk_end
        # The section at this break opens the next chunk.
        pending_ids = [idx]
    if chunk_start < text_len:
        yield pending_ids, text[chunk_start:]