atextcrawler/tests/simhash.py

55 lines
1.7 KiB
Python

"""
Test cases for the simhash similarity utilities.
"""
from unittest import TestCase
from simhash import Simhash, SimhashIndex
from atextcrawler.utils.similarity import (
create_simhash,
get_features,
get_simhash,
postgresql_bigint_offset,
search_simhash,
)
class SimhashTest(TestCase):
    """
    Check simhash index population and near-duplicate lookup.
    """

    def test_search(self):
        """
        Query a prebuilt index (k=3) and compare the ids that come back.
        """
        high_byte = 0b1111111100000000
        # Same as high_byte, but with the three lowest bits set as well.
        high_byte_low3 = 0b1111111100000111
        top_bit = 0b1000000000000000
        # Same as top_bit, but with the three lowest bits set as well.
        top_bit_low3 = 0b1000000000000111
        # Differs from every indexed value in at least four bit positions.
        outlier = 0b1000001111000000

        # high_byte_low3 and outlier are used as queries only.
        indexed = (
            ('1', high_byte),
            ('3', top_bit),
            ('4', top_bit_low3),
        )
        index = SimhashIndex(
            [(key, Simhash(value)) for key, value in indexed],
            k=3,
        )

        # (query value, ids expected from search_simhash), checked in order.
        expectations = (
            (outlier, []),
            (high_byte, [1]),
            (high_byte_low3, [1]),
            (top_bit_low3, [3, 4]),
        )
        for query_value, expected_ids in expectations:
            found = search_simhash(index, Simhash(query_value))
            self.assertEqual(found, expected_ids)

    def test_create(self):
        """
        Add two texts to an empty index and look them up again.
        """
        index = SimhashIndex([], k=3)
        repeated_text = 'hello ' * 20
        stored_1 = create_simhash(index, 101, get_simhash(repeated_text))
        stored_2 = create_simhash(index, 102, get_simhash('another one'))

        # The values returned by create_simhash are shifted by
        # postgresql_bigint_offset before being wrapped in a Simhash again
        # (presumably they are in PostgreSQL bigint range - TODO confirm
        # against atextcrawler.utils.similarity).
        restored_1 = Simhash(stored_1 + postgresql_bigint_offset)
        restored_2 = Simhash(stored_2 + postgresql_bigint_offset)
        for restored, resource_id in ((restored_1, 101), (restored_2, 102)):
            found = search_simhash(index, restored)
            self.assertEqual(found, [resource_id])

        # A text with one extra trailing character must still hit entry 101.
        near_copy = get_simhash(repeated_text + 'X')
        found = search_simhash(index, near_copy)
        self.assertEqual(found, [101])