""" Test cases for text util. """ from unittest import TestCase from simhash import Simhash, SimhashIndex from atextcrawler.utils.similarity import ( create_simhash, get_features, get_simhash, postgresql_bigint_offset, search_simhash, ) class SimhashTest(TestCase): """ Test simhash creation and search. """ def test_search(self): n1 = int('1111111100000000', 2) n2 = int('1111111100000111', 2) n3 = int('1000000000000000', 2) n4 = int('1000000000000111', 2) n5 = int('1000001111000000', 2) objs = [ ('1', Simhash(n1)), ('3', Simhash(n3)), ('4', Simhash(n4)), ] index = SimhashIndex(objs, k=3) found = search_simhash(index, Simhash(n5)) self.assertEqual(found, []) found = search_simhash(index, Simhash(n1)) self.assertEqual(found, [1]) found = search_simhash(index, Simhash(n2)) self.assertEqual(found, [1]) found = search_simhash(index, Simhash(n4)) self.assertEqual(found, [3, 4]) def test_create(self): index = SimhashIndex([], k=3) hash_val_1 = create_simhash(index, 101, get_simhash('hello ' * 20)) hash_val_2 = create_simhash(index, 102, get_simhash('another one')) simhash_1 = Simhash(hash_val_1 + postgresql_bigint_offset) simhash_2 = Simhash(hash_val_2 + postgresql_bigint_offset) found = search_simhash(index, simhash_1) self.assertEqual(found, [101]) found = search_simhash(index, simhash_2) self.assertEqual(found, [102]) simhash_3 = get_simhash('hello ' * 20 + 'X') found = search_simhash(index, simhash_3) self.assertEqual(found, [101])