atextcrawler/tests/text.py

66 lines
2.0 KiB
Python

"""
Test cases for text util.
"""
from unittest import TestCase
from atextcrawler.utils.html import clean_page
class CleanHtmlTest(TestCase):
    """
    Test clean_page.

    Every test passes an HTML snippet to clean_page and compares the
    string form of the result with the expected markup.

    Have an eye on self-closing tags (br, hr, ...): the expected
    strings below spell a line break as ``<br/>``, whether the input
    used ``<br>`` or ``<br />``.
    """

    def _assert_cleaned(self, html, expected):
        """
        Assert that ``str(clean_page(html))`` equals *expected*.
        """
        self.assertEqual(str(clean_page(html)), expected)

    def test_clean_page_1(self):
        """
        Drop a script element whose body contains a quoted ``<script>``.
        """
        self._assert_cleaned(
            '<em>Hello</em><br><script>malicious="<script>"</script>anything',
            '<em>Hello</em><br/>anything',
        )

    def test_clean_page_2(self):
        """
        Drop a script element whose body contains a bare ``<script>``.
        """
        self._assert_cleaned(
            '<em>Hello</em><br /><script>malicious<script></script>anything',
            '<em>Hello</em><br/>anything',
        )

    def test_clean_page_3(self):
        """
        Drop nested figure elements together with all of their content.
        """
        # nesting
        self._assert_cleaned(
            '--<figure>xx<figure>yy</figure>zz</figure>..',
            '--..',
        )

    def test_clean_page_4(self):
        """
        Drop elements with aria-hidden "true"; keep any other value.

        The cases run as sub-tests so that one failing input does not
        hide the results of the remaining ones.
        """
        # aria-hidden
        cases = (
            # value is true (unquoted or quoted): the element is dropped
            ('--<p aria-hidden=true>xx</p>..', '--..'),
            ('--<p aria-hidden="true">xx</p>..', '--..'),
            # any other value: the element is kept and the attribute
            # value is expected in double quotes
            (
                '--<p aria-hidden=false>xx</p>..',
                '--<p aria-hidden="false">xx</p>..',
            ),
            (
                '--<p aria-hidden="false">xx</p>..',
                '--<p aria-hidden="false">xx</p>..',
            ),
            (
                '--<p aria-hidden=??>xx</p>..',
                '--<p aria-hidden="??">xx</p>..',
            ),
        )
        for html, expected in cases:
            with self.subTest(html=html):
                self._assert_cleaned(html, expected)

    def test_clean_page_5(self):
        """
        Leave markup without any removable element unchanged.
        """
        # no removal
        markup = '--<p>xx<em>yy</em></p>..'
        self._assert_cleaned(markup, '--<p>xx<em>yy</em></p>..')

    def test_clean_page_6(self):
        """
        Drop self-closing area tags and keep the surrounding markup.
        """
        # self-closing tags to be removed
        self._assert_cleaned(
            '--<area /><p>xx</p>\n...<h1>tt<area /></h1>nn',
            '--<p>xx</p>\n...<h1>tt</h1>nn',
        )

    def test_clean_page_7(self):
        """
        Drop a p element with rel=search together with its content.
        """
        self._assert_cleaned(
            '--<p rel=search>tt<area /></p>nn',
            '--nn',
        )