# atextcrawler/src/atextcrawler/utils/tag.py
#
# 190 lines
# 3.5 KiB
# Python

"""
Information collections related to html tags.
"""
drop_tags = [
    # a - b
    'applet', 'area', 'audio',
    'base', 'basefont', 'bdi', 'bdo', 'button',
    # c - e
    'canvas', 'code', 'command',
    'data', 'datalist', 'dir',
    'embed',
    # f - l
    'fieldset', 'figure', 'form', 'frame', 'frameset',
    'iframe', 'img', 'input',
    'label', 'legend',
    # m - p
    'map', 'menuitem', 'meter',
    'noframes', 'noscript',
    'object', 'optgroup', 'option',
    'param', 'picture', 'progress',
    # r - s
    'rp', 'rt', 'ruby',
    'samp', 'script', 'select', 'source', 'style', 'svg',
    # t - v
    'template', 'textarea', 'track',
    'var', 'video',
]
"""
Names of tags that are dropped together with everything they contain.

The entries are kept in alphabetical order.
"""
keep_tags = {
    # tag:         (sep, lvl, sem)
    'a':           (0, 0, ''),
    'abbr':        (0, 0, 'st'),
    'acronym':     (0, 0, 'st'),
    'address':     (1, 0, 'm'),
    'article':     (1, 15, ''),
    'aside':       (1, 0, 'd'),
    'b':           (0, 0, 'st'),
    'blockquote':  (1, 65, 'q'),
    'br':          (1, 80, ''),
    'caption':     (1, 68, ''),
    'center':      (1, 50, ''),
    'cite':        (1, 0, 'd'),
    'col':         (1, 75, ''),
    'colgroup':    (1, 73, ''),
    'dd':          (1, 70, 'li'),
    'del':         (0, 0, 'se'),
    'details':     (1, 0, 'd'),
    'dfn':         (0, 0, 'st'),
    # lvl often revised to min of contained tags
    'div':         (1, 60, ''),
    'dl':          (1, 70, 'l'),
    'dt':          (1, 70, 'li'),
    'em':          (0, 0, 'st'),
    'figcaption':  (1, 0, ''),
    'font':        (0, 0, 's'),
    'footer':      (1, 15, ''),
    'h1':          (1, 30, ''),
    'h2':          (1, 32, ''),
    'h3':          (1, 34, ''),
    'h4':          (1, 36, ''),
    'h5':          (1, 38, ''),
    'h6':          (1, 40, ''),
    'header':      (1, 15, ''),
    'hr':          (1, 30, ''),
    'i':           (0, 0, 'st'),
    'ins':         (0, 0, 'se'),
    # lvl revised if not inside p
    'li':          (1, 75, 'li'),
    'main':        (1, 10, ''),
    'mark':        (0, 0, 's'),
    # keep for footnotes
    'nav':         (1, 0, ''),
    # lvl revised if not inside p
    'ol':          (1, 70, 'l'),
    'p':           (1, 60, ''),
    'pre':         (1, 65, 'q'),
    'q':           (1, 0, 'q'),
    's':           (0, 0, ''),
    'section':     (1, 24, ''),
    'small':       (0, 0, 'd'),
    'span':        (0, 0, 's'),
    'strike':      (0, 0, 'se'),
    'strong':      (0, 0, 'st'),
    'sub':         (0, 0, ''),
    'summary':     (1, 20, 'm'),
    'sup':         (0, 0, ''),
    'table':       (1, 65, ''),
    'tbody':       (1, 70, ''),
    'td':          (1, 78, ''),
    'tfoot':       (1, 70, ''),
    'th':          (1, 75, ''),
    'thead':       (1, 70, ''),
    'time':        (0, 0, 'm'),
    'tr':          (1, 75, ''),
    'u':           (0, 0, 's'),
    # lvl revised if not inside p
    'ul':          (1, 70, 'l'),
}
"""
Tags that are kept for annotation, mapped to their properties.

Each value is a tuple ``(sep, lvl, sem)``:

  * sep: whether the text on both sides of the tag is to be separated
    with a space (1) or not (0)
  * lvl: structural depth level of the content of this tag;
    the paragraph level is 60; headings are below 60, listings above;
    a div below the tag will usually have the tag's depth + 1
  * sem: semantic categories, a string of zero or more of these
    one-letter codes:

      * s = span
      * l = listing
      * i = list_item
      * t = term
      * e = edit
      * d = details
      * q = quote
      * m = meta
      * x = exclude
"""
# 'col' is a key of keep_tags and is also listed in all_self_closing_tags,
# so it belongs to "those among keep_tags which are self-closing"; it was
# previously missing here.
# NOTE(review): confirm that consumers of this tuple handle 'col' the same
# way they handle 'br' and 'hr'.
self_closing_tags = ('br', 'col', 'hr')
"""
Those among keep_tags which are self-closing.

This is the intersection of the keys of keep_tags with
all_self_closing_tags; keep it in sync when either of them changes.
"""
all_self_closing_tags = (
    'area', 'base', 'br', 'col', 'embed',
    'hr', 'img', 'input', 'link', 'meta',
    'param', 'source', 'track', 'wbr',
)
"""
Every tag that is self-closing according to the html standard.

Unlike self_closing_tags, this is not restricted to tags in keep_tags.
"""
drop_roles = (
    'banner', 'complementary', 'contentinfo',
    'dialog', 'figure', 'form',
    'img', 'search', 'switch',
)
"""
Aria roles; tags having one of these roles are dropped.
"""