"""
Information collections related to HTML tags.
"""


drop_tags = [
    'applet',
    'area',
    'audio',
    'base',
    'basefont',
    'bdi',
    'bdo',
    'button',
    'canvas',
    'code',
    'command',
    'data',
    'datalist',
    'dir',
    'embed',
    'fieldset',
    'figure',
    'form',
    'frame',
    'frameset',
    'iframe',
    'img',
    'input',
    'label',
    'legend',
    'map',
    'menuitem',
    'meter',
    'noframes',
    'noscript',
    'object',
    'optgroup',
    'option',
    'param',
    'picture',
    'progress',
    'rp',
    'rt',
    'ruby',
    'samp',
    'script',
    'select',
    'source',
    'style',
    'svg',
    'template',
    'textarea',
    'track',
    'var',
    'video',
]
"""
Tags to drop, including their content.

The entries are lowercase tag names, listed alphabetically.
"""


keep_tags = {
    'a': (0, 0, ''),
    'abbr': (0, 0, 'st'),
    'acronym': (0, 0, 'st'),
    'address': (1, 0, 'm'),
    'article': (1, 15, ''),
    'aside': (1, 0, 'd'),
    'b': (0, 0, 'st'),
    'blockquote': (1, 65, 'q'),
    'br': (1, 80, ''),
    'caption': (1, 68, ''),
    'center': (1, 50, ''),
    'cite': (1, 0, 'd'),
    'col': (1, 75, ''),
    'colgroup': (1, 73, ''),
    'dd': (1, 70, 'li'),
    'del': (0, 0, 'se'),
    'details': (1, 0, 'd'),
    'dfn': (0, 0, 'st'),
    'div': (1, 60, ''),  # lvl often revised to min of contained tags
    'dl': (1, 70, 'l'),
    'dt': (1, 70, 'li'),
    'em': (0, 0, 'st'),
    'figcaption': (1, 0, ''),
    'font': (0, 0, 's'),
    'footer': (1, 15, ''),
    'h1': (1, 30, ''),
    'h2': (1, 32, ''),
    'h3': (1, 34, ''),
    'h4': (1, 36, ''),
    'h5': (1, 38, ''),
    'h6': (1, 40, ''),
    'header': (1, 15, ''),
    'hr': (1, 30, ''),
    'i': (0, 0, 'st'),
    'ins': (0, 0, 'se'),
    'li': (1, 75, 'li'),  # lvl revised if not inside p
    'main': (1, 10, ''),
    'mark': (0, 0, 's'),
    'nav': (1, 0, ''),  # keep for footnotes
    'ol': (1, 70, 'l'),  # lvl revised if not inside p
    'p': (1, 60, ''),
    'pre': (1, 65, 'q'),
    'q': (1, 0, 'q'),
    's': (0, 0, ''),
    'section': (1, 24, ''),
    'small': (0, 0, 'd'),
    'span': (0, 0, 's'),
    'strike': (0, 0, 'se'),
    'strong': (0, 0, 'st'),
    'sub': (0, 0, ''),
    'summary': (1, 20, 'm'),
    'sup': (0, 0, ''),
    'table': (1, 65, ''),
    'tbody': (1, 70, ''),
    'td': (1, 78, ''),
    'tfoot': (1, 70, ''),
    'th': (1, 75, ''),
    'thead': (1, 70, ''),
    'time': (0, 0, 'm'),
    'tr': (1, 75, ''),
    'u': (0, 0, 's'),
    'ul': (1, 70, 'l'),  # lvl revised if not inside p
}
"""
Tags to keep for annotation, and their properties.

Each key is a tag name; each value is a tuple ``(sep, lvl, sem)``.

The properties are:

  * sep: whether to separate text at both sides of the tag with a space
         (1 = yes, 0 = no)
  * lvl: structural depth level of content of this tag;
         the paragraph level is 60; headings are below 60, listings above;
         a div below the tag will usually have the tag's depth + 1
  * sem: semantic categories: a string of zero or more of these letters
         (e.g. 'st' combines span and term)

    * s=span
    * l=listing
    * i=list_item
    * t=term
    * e=edit
    * d=details
    * q=quote
    * m=meta
    * x=exclude
"""


# NOTE(review): 'col' is a key of keep_tags and is listed in
# all_self_closing_tags, but it is not listed here -- confirm whether the
# omission is intentional before relying on this tuple being exhaustive.
self_closing_tags = ('br', 'hr')
"""
Those among keep_tags which are self-closing.
"""


all_self_closing_tags = (
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
)
"""
All self-closing tags of the html standard.

These are elements that never have a closing tag or content.
"""


drop_roles = (
    'banner',
    'complementary',
    'contentinfo',
    'dialog',
    'figure',
    'form',
    'img',
    'search',
    'switch',
)
"""
Drop tags with these aria roles.
"""