Compare commits
2 Commits
08ca7fee56
...
554cf14451
Author | SHA1 | Date |
---|---|---|
ibu | 554cf14451 | |
ibu | f8debcff87 | |
|
@ -70,6 +70,12 @@ crawl:
|
|||
# Allowed values: positive number
|
||||
#feed_crawl_interval: 86400
|
||||
|
||||
# Minimum length of the text (in characters) extracted from
|
||||
# a resource; resources with shorter texts are not stored.
|
||||
# Default value: 300
|
||||
# Allowed values: positive number
|
||||
#min_text_length: 300
|
||||
|
||||
# Parameters for access to the ElasticSearch service
|
||||
# No default values; must be set.
|
||||
elasticsearch:
|
||||
|
|
|
@ -278,6 +278,7 @@ schema_crawl = Schema(
|
|||
Required('resource_delay', default=5): positive_number,
|
||||
Required('full_crawl_interval', default=864000): positive_number,
|
||||
Required('feed_crawl_interval', default=86400): positive_number,
|
||||
Required('min_text_length', default=300): positive_number,
|
||||
}
|
||||
)
|
||||
|
||||
|
|
|
@ -143,7 +143,7 @@ async def process_site_path(
|
|||
"""
|
||||
Fetch a path, deduplicate and if canonical, update and index the resource.
|
||||
|
||||
Return whether a new resource was handled that should contribute be
|
||||
Return whether a new resource was handled that should contribute to
|
||||
statistics.
|
||||
"""
|
||||
msg = (
|
||||
|
@ -241,10 +241,17 @@ async def _handle_text_resource(
|
|||
|
||||
# find resources similar to the current text
|
||||
text = resource.search_fields['text']
|
||||
if len(text) < 300: # discard resources with too short texts
|
||||
site_path.resource_id = None
|
||||
|
||||
# discard resources with too short texts
|
||||
if len(text) < app.config['crawl']['min_text_length']:
|
||||
await site_path.unlink_resource(
|
||||
conn,
|
||||
app.search_engine,
|
||||
app.config['elasticsearch']['index_base_name'],
|
||||
)
|
||||
await site_path.save(conn)
|
||||
return False, False
|
||||
|
||||
simhash = simhash_from_bigint(resource.simhash)
|
||||
index = site.simhash_index
|
||||
similar_ids = search_simhash(index, simhash)
|
||||
|
|
|
@ -53,6 +53,7 @@ properties = {
|
|||
'time_horizon': {'type': 'keyword'},
|
||||
'orig_source': {'type': 'text'},
|
||||
'topics': {'type': 'text'},
|
||||
'length': {'type': 'integer'},
|
||||
'annotations': {'type': 'text', 'index': False},
|
||||
'sections': {
|
||||
'type': 'nested',
|
||||
|
@ -179,6 +180,7 @@ async def index_resource(
|
|||
'time_horizon': resource.search_fields.get('time_horizon'),
|
||||
'orig_source': resource.search_fields.get('orig_source'),
|
||||
'topics': resource.search_fields.get('topics'),
|
||||
'length': len(text),
|
||||
'annotations': pack_annotations(annotations),
|
||||
'sections': sections,
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue