diff --git a/doc/source/config_template/main.yaml b/doc/source/config_template/main.yaml
index 3e876cd..3729180 100644
--- a/doc/source/config_template/main.yaml
+++ b/doc/source/config_template/main.yaml
@@ -70,6 +70,12 @@ crawl:
   # Allowed values: positive number
   #feed_crawl_interval: 86400
 
+  # Minimum length of the text (in characters) extracted from
+  # a resource; resources with shorter texts are not stored.
+  # Default value: 300
+  # Allowed values: positive number
+  #min_text_length: 300
+
 # Parameters for access to the ElasticSearch service
 # No default values; must be set.
 elasticsearch:
diff --git a/src/atextcrawler/config.py b/src/atextcrawler/config.py
index 0a07727..afef06f 100644
--- a/src/atextcrawler/config.py
+++ b/src/atextcrawler/config.py
@@ -278,6 +278,7 @@ schema_crawl = Schema(
         Required('resource_delay', default=5): positive_number,
         Required('full_crawl_interval', default=864000): positive_number,
         Required('feed_crawl_interval', default=86400): positive_number,
+        Required('min_text_length', default=300): positive_number,
     }
 )
 
diff --git a/src/atextcrawler/resource/operations.py b/src/atextcrawler/resource/operations.py
index 078a668..0ed3a22 100644
--- a/src/atextcrawler/resource/operations.py
+++ b/src/atextcrawler/resource/operations.py
@@ -143,7 +143,7 @@ async def process_site_path(
     """
     Fetch a path, deduplicate and if canonical, update and index the resource.
 
-    Return whether a new resource was handled that should contribute be
+    Return whether a new resource was handled that should contribute to
     statistics.
     """
     msg = (
@@ -241,10 +241,17 @@ async def _handle_text_resource(
 
     # find resources similar to the current text
     text = resource.search_fields['text']
-    if len(text) < 300:  # discard resources with too short texts
-        site_path.resource_id = None
+
+    # discard resources with too short texts
+    if len(text) < app.config['crawl']['min_text_length']:
+        await site_path.unlink_resource(
+            conn,
+            app.search_engine,
+            app.config['elasticsearch']['index_base_name'],
+        )
         await site_path.save(conn)
         return False, False
+
     simhash = simhash_from_bigint(resource.simhash)
     index = site.simhash_index
     similar_ids = search_simhash(index, simhash)