""" Text similarity with simhash. """ import logging from asyncpg import Connection from simhash import Simhash, SimhashIndex logger = logging.getLogger(__name__) logger.setLevel(logging.ERROR) postgresql_bigint_offset = 9223372036854775808 """ Subtract this number to get a PostgreSQL bigint from a 64bit int. """ def get_features(txt: str) -> list[str]: """ Extract features from string for use with Simhash. """ width = 3 txt = txt.replace(' ', '').lower() return [txt[i : i + width] for i in range(max(len(txt) - width + 1, 1))] def simhash_to_bigint(simhash: Simhash) -> int: """ Convert a simhash to PostgreSQL's bigint value range. """ return simhash.value - postgresql_bigint_offset def simhash_from_bigint(bigint: int) -> Simhash: """ Convert a simhash from PostgreSQL's bigint to a Simhash instance. """ return Simhash(bigint + postgresql_bigint_offset, log=logger) def get_simhash(text: str) -> Simhash: """ Return the Simhash of the given text. """ return Simhash(get_features(text), log=logger) async def get_simhash_index(conn: Connection, site_id: int) -> SimhashIndex: """ Return a simhash index with hashes of all stored resources of the site. """ sql = ( "SELECT r.id, r.simhash FROM site_path sp, resource r" " WHERE sp.site_id=$1 AND sp.resource_id=r.id" ) rows = await conn.fetch(sql, site_id) objs = [ ( str(row['id']), Simhash(row['simhash'] + postgresql_bigint_offset, log=logger), ) for row in rows ] return SimhashIndex(objs, k=3, log=logger) def create_simhash( index: SimhashIndex, resource_id: int, simhash_instance: Simhash, ) -> int: """ Add a resource with given id and simhash to a simhash index. Return the simhash value shifted into PostgreSQL's bigint range. (The simhash field of the resource's database entry is not updated.) """ index.add(str(resource_id), simhash_instance) return simhash_to_bigint(simhash_instance) def search_simhash(index: SimhashIndex, simhash_inst: Simhash) -> list[int]: """ Return the ids of similar resources from the index. """ found = index.get_near_dups(simhash_inst) if found: return sorted([int(elem) for elem in found]) return []