Compare commits

...

2 Commits

Author SHA1 Message Date
ibu 1440378c20 Improve comments in config_template 2021-12-08 13:00:08 +00:00
ibu 8246ce6251 Only return a SitePath if the site has crawl_enabled=true 2021-12-08 12:41:34 +00:00
2 changed files with 9 additions and 4 deletions

View File

@ -76,10 +76,10 @@ elasticsearch:
# host on which ES is running
host: localhost
# API key for accessing ES
api_key: "**********************"
api_key: "____________________"
# API user id
id: "**********************"
# Index base name (actual index names will have '_text' etc. appended)
id: "____________________"
# Index base name (full index names will have '_text_{language}' appended)
index_base_name: atext
# Tensorflow access

View File

@ -99,14 +99,19 @@ async def get_site_path(
Return the next path of a given site that needs to be processed.
If none needs to be processed, return None.
In particular, for sites having crawl_enabled=false, return None.
Only return paths that have last been visited before *before*
or not been processed at all. Paths with a ok_count of -3 or lower
or not been processed at all. Paths with an ok_count of -3 or lower
are dropped.
If *only_new*, limit to paths that have not been processed at all,
irrespective of the value of *before*.
"""
sql = "SELECT crawl_enabled FROM site WHERE id=$1"
crawl_enabled = await conn.fetchval(sql, site.id_)
if not crawl_enabled:
return None
if only_new:
sql = (
"SELECT * FROM site_path"