Compare commits

...

2 Commits

4 changed files with 19 additions and 3 deletions

View File

@@ -70,6 +70,12 @@ crawl:
   # Allowed values: positive number
   #feed_crawl_interval: 86400
+  # Minimum length of the text (in characters) extracted from
+  # a resource; resources with shorter texts are not stored.
+  # Default value: 300
+  # Allowed values: positive number
+  #min_text_length: 300
 # Parameters for access to the ElasticSearch service
 # No default values; must be set.
 elasticsearch:
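
The new option ships commented out, so the default of 300 applies. To override it, one would uncomment the line under the crawl section, e.g. (a sketch with an arbitrary value):

crawl:
  min_text_length: 500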

View File

@@ -278,6 +278,7 @@ schema_crawl = Schema(
         Required('resource_delay', default=5): positive_number,
         Required('full_crawl_interval', default=864000): positive_number,
         Required('feed_crawl_interval', default=86400): positive_number,
+        Required('min_text_length', default=300): positive_number,
     }
 )
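
Because the key is declared with Required(..., default=300), existing config files that lack it keep validating and silently pick up the default. A minimal self-contained sketch of that behavior; positive_number is approximated here with stock voluptuous validators, since the project's own definition is not part of this diff:

from voluptuous import All, Range, Required, Schema

positive_number = All(int, Range(min=1))  # stand-in for the project's validator (assumption)

schema = Schema({Required('min_text_length', default=300): positive_number})

print(schema({}))                        # {'min_text_length': 300}
print(schema({'min_text_length': 500}))  # {'min_text_length': 500}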

View File

@@ -143,7 +143,7 @@ async def process_site_path(
     """
     Fetch a path, deduplicate and if canonical, update and index the resource.
-    Return whether a new resource was handled that should contribute be
+    Return whether a new resource was handled that should contribute to
     statistics.
     """
     msg = (
@@ -241,10 +241,17 @@ async def _handle_text_resource(
     # find resources similar to the current text
     text = resource.search_fields['text']
-    if len(text) < 300: # discard resources with too short texts
-        site_path.resource_id = None
+    # discard resources with too short texts
+    if len(text) < app.config['crawl']['min_text_length']:
+        await site_path.unlink_resource(
+            conn,
+            app.search_engine,
+            app.config['elasticsearch']['index_base_name'],
+        )
         await site_path.save(conn)
         return False, False
     simhash = simhash_from_bigint(resource.simhash)
     index = site.simhash_index
     similar_ids = search_simhash(index, simhash)
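
The hard-coded threshold of 300 is thus replaced by a lookup into the validated config, and a too-short text now fully unlinks the resource instead of only clearing resource_id. A reduced sketch of the gate, assuming a config dict shaped like the template above (keep_text is an illustrative name, not taken from the code):

config = {'crawl': {'min_text_length': 300}}

def keep_text(text: str) -> bool:
    # Mirrors the crawler's gate: texts shorter than the configured
    # minimum are discarded instead of being stored and indexed.
    return len(text) >= config['crawl']['min_text_length']

assert not keep_text('too short')
assert keep_text('x' * 300)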

View File

@@ -53,6 +53,7 @@ properties = {
     'time_horizon': {'type': 'keyword'},
     'orig_source': {'type': 'text'},
     'topics': {'type': 'text'},
+    'length': {'type': 'integer'},
     'annotations': {'type': 'text', 'index': False},
     'sections': {
         'type': 'nested',
@@ -179,6 +180,7 @@ async def index_resource(
         'time_horizon': resource.search_fields.get('time_horizon'),
         'orig_source': resource.search_fields.get('orig_source'),
         'topics': resource.search_fields.get('topics'),
+        'length': len(text),
         'annotations': pack_annotations(annotations),
         'sections': sections,
     }
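
Storing the text length as an integer field makes it filterable and sortable at query time. A hypothetical example in the standard Elasticsearch query DSL (the search call and index name are assumptions, not part of this change):

query = {
    'query': {
        'range': {'length': {'gte': 1000}},  # only resources with longer texts
    },
}
# e.g. hits = await app.search_engine.search(index=index_name, body=query)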