atextcrawler/tests/annotation.py

50 lines
1.6 KiB
Python

"""
Test cases for the annotate utility (atextcrawler.utils.annotation).
"""
from unittest import TestCase
from atextcrawler.utils.annotation import annotate
class AnnotateTest(TestCase):
    """
    Run ``annotate`` on small HTML fragments and check its output.

    Every case compares the returned text and selected entries of the
    returned annotations mapping with hard-coded expected values.

    Keep in mind that the <br> and <hr> tags are self-closing.
    """

    def _assert_annotation(self, html, expected_text, expected_annotations):
        # Shared driver: call annotate once, check the text first, then each
        # requested annotation entry in the order given by the caller.
        plain, annotations = annotate(html)
        self.assertEqual(plain, expected_text)
        for key, expected in expected_annotations.items():
            self.assertEqual(annotations[key], expected)

    def test_annotate_1(self):
        # Two inline elements separated by a <br>.
        self._assert_annotation(
            '<em>Hello</em><br><strong>world</strong>',
            ' Hello world',
            {
                'semantic_breaks': {0: 80, 6: 80},
                'section_ids': {},
            },
        )

    def test_annotate_2(self):
        # Like case 1, but with spaces inside the inline elements; the
        # expected results are the same as in case 1.
        self._assert_annotation(
            '<em> Hello </em><br><strong> world </strong>',
            ' Hello world',
            {
                'semantic_breaks': {0: 80, 6: 80},
                'section_ids': {},
            },
        )

    def test_annotate_3(self):
        # A paragraph with a nested <em>; section_ids is not checked here.
        self._assert_annotation(
            '<p> Hello <em>world</em> </p> ',
            ' Hello world',
            {
                'semantic_breaks': {0: 60},
            },
        )

    def test_annotate_4(self):
        # A <div> with an id attribute (written with spaces around '=')
        # wrapping a paragraph; the id is expected under key 0.
        self._assert_annotation(
            '<div id = "ref1"><p>Hello <em>world</em> </p> </div>',
            ' Hello world',
            {
                'semantic_breaks': {0: 60},
                'section_ids': {0: ['ref1']},
            },
        )

    def test_annotate_5(self):
        # A <br> carrying an id, followed by bare text and a paragraph; the
        # id is expected under key 1.
        self._assert_annotation(
            '<br id="ref2"> Hello <p>world </p> ',
            ' Hello world',
            {
                'semantic_breaks': {0: 80, 6: 60},
                'section_ids': {1: ['ref2']},
            },
        )