atextcrawler/src/atextcrawler/crawl/__main__.py

"""
Run a crawl for a specifiv base_url. Use only on dev instance!
"""
import asyncio
import logging
import sys

import aiohttp

from ..config import Config
from ..db import PGPool
from ..models import Site, SitePath
from ..resource import ResourceFetcher, get_site_path, process_site_path
from ..search import shutdown_engine, startup_engine
from ..tensorflow import TensorFlow
from ..utils.similarity import get_simhash_index
from . import get_or_create_crawl

logger = logging.getLogger()
logger.setLevel(logging.WARNING)
# logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())


async def run():
    """
    Run a full or feed crawl of the website with the given base_url,
    or crawl just one of its paths.

    The third command line argument (path) is optional.
    """
    config = Config().get()
    pgpool = await PGPool(config['postgresql'])
    pool = pgpool.pool
    try:
        crawl_type = sys.argv[1]
        if crawl_type not in ('full', 'feed'):
            logger.error('First argument must be "full" or "feed".')
            sys.exit(2)
        base_url = sys.argv[2]
    except IndexError:
        msg = (
            'Please give two arguments:'
            ' 1) crawl type ("full" or "feed"),'
            ' 2) the base URL of the site to crawl'
        )
        logger.error(msg)
        sys.exit(2)
    if len(sys.argv) > 3:
        path = sys.argv[3]
    else:
        path = None

    # find site
    async with pool.acquire() as conn:
        sql = 'select id from site where base_url=$1'
        site_id = await conn.fetchval(sql, base_url)
        if site_id:
            site = await Site().load(conn, site_id)
            logger.warning(f'site_id: {site.id_}')
            logger.warning(f'crawl_enabled: {site.crawl_enabled}')
            site.simhash_index = await get_simhash_index(conn, site_id)
        else:
            logger.warning('Site not found')

    if site_id:
        if site.crawl_enabled:
            await run_crawl(config, pool, site, crawl_type, path)
        else:
            logger.warning('Site has crawl_enabled=false')

    # shutdown
    await pgpool.shutdown()
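

# AppMock mimics the minimal application interface used by process_site_path
# (config, search_engine, plugins); its dummy rp_filter applies no path
# filtering (assumption: durl.pwa() returns the path unchanged).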
class AppMock:
    def __init__(self, config, search_engine):
        self.config = config
        self.search_engine = search_engine

        class DummyModule:
            def rp_filter(self, site, durl):
                return durl.pwa()

        self.plugins = {'filter_resource_path': DummyModule()}
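

# run_crawl processes either a single given path or, in a loop, all paths of
# the site that are pending for this crawl (new paths only for feed crawls).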
async def run_crawl(config, pool, site, crawl_type, path):
    session = aiohttp.ClientSession()
    fetcher = ResourceFetcher(session)
    tf = TensorFlow(config['tensorflow'], session)
    search_engine = await startup_engine(config)
    app = AppMock(config, search_engine)
    async with pool.acquire() as conn:
        is_full = crawl_type == 'full'
        crawl = await get_or_create_crawl(conn, site.id_, is_full=is_full)
        logger.warning(crawl)
        if path:
            sql = "SELECT * FROM site_path WHERE site_id=$1 AND path=$2"
            row = await conn.fetchrow(sql, site.id_, path)
            if row:
                site_path = await SitePath().load_from_row(row)
                await process_site_path(
                    app,
                    999,
                    conn,
                    fetcher,
                    tf,
                    site,
                    site_path,
                )
            else:
                logger.error('Path does not exist in table site_path')
        else:
            while True:
                site_path = await get_site_path(
                    conn,
                    site,
                    crawl.t_begin,
                    only_new=not crawl.is_full,
                )
                if not site_path:
                    logger.warning('Paths exhausted.')
                    break
                logger.warning(site_path)
                is_new_resource = await process_site_path(
                    app,
                    999,
                    conn,
                    fetcher,
                    tf,
                    site,
                    site_path,
                )
                logger.warning(f'Is new: {is_new_resource}')
    await shutdown_engine(search_engine)
    # close the shared aiohttp client session
    await session.close()


if __name__ == '__main__':
    asyncio.run(run())