diff --git a/src/atextcrawler/resource/operations.py b/src/atextcrawler/resource/operations.py index dffe2bc..078a668 100644 --- a/src/atextcrawler/resource/operations.py +++ b/src/atextcrawler/resource/operations.py @@ -99,14 +99,19 @@ async def get_site_path( Return the next path of a given site that needs to be processed. If none needs to be processed, return None. + In particular, for sites having crawl_enabled=false return None. Only return paths that have last been visited before *before* - or not been processed at all. Paths with a ok_count of -3 or lower + or not been processed at all. Paths with an ok_count of -3 or lower are dropped. If *only_new*, limit to paths that have not been processed at all, irrespective of the value of *before*. """ + sql = "SELECT crawl_enabled FROM site WHERE id=$1" + crawl_enabled = await conn.fetchval(sql, site.id_) + if not crawl_enabled: + return None if only_new: sql = ( "SELECT * FROM site_path"