139 lines
4.1 KiB
Python
139 lines
4.1 KiB
Python
"""
|
|
Run a crawl for a specifiv base_url. Use only on dev instance!
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import sys
|
|
|
|
import aiohttp
|
|
|
|
from ..config import Config
|
|
from ..db import PGPool
|
|
from ..models import Site, SitePath
|
|
from ..resource import ResourceFetcher, get_site_path, process_site_path
|
|
from ..search import shutdown_engine, startup_engine
|
|
from ..tensorflow import TensorFlow
|
|
from ..utils.similarity import get_simhash_index
|
|
from . import get_or_create_crawl
|
|
|
|
# Configure the root logger: WARNING and above go to stderr via a plain
# StreamHandler so crawl progress is visible when run as a script.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)
#logger.setLevel(logging.DEBUG)  # toggle for verbose debugging
logger.addHandler(logging.StreamHandler())
|
|
|
|
|
|
async def run():
    """
    Run a full/feed crawl a website with given base_url, or just a path.

    Command-line arguments:
        1) crawl type: "full" or "feed"
        2) base URL of the site to crawl
        3) optional: a single path to crawl instead of the whole site

    Exits with status 2 on invalid arguments. Looks up the site by
    base_url in the database and delegates to run_crawl() if the site
    exists and has crawl_enabled set.
    """
    config = Config().get()
    pgpool = await PGPool(config['postgresql'])
    pool = pgpool.pool

    try:
        crawl_type = sys.argv[1]
        if crawl_type not in ('full', 'feed'):
            # Previously this only logged and kept running with an
            # invalid crawl type; abort instead.
            logger.error('First argument must be "full" or "feed".')
            sys.exit(2)
        base_url = sys.argv[2]
    except IndexError:
        # Narrowed from a bare `except:` which would also have swallowed
        # SystemExit and KeyboardInterrupt.
        msg = (
            'Please give two arguments:'
            ' 1) crawl type ("full" or "feed"),'
            ' 2) the base URL of the site to crawl'
        )
        logger.error(msg)
        sys.exit(2)
    if len(sys.argv) > 3:
        path = sys.argv[3]
    else:
        path = None

    # find site
    async with pool.acquire() as conn:
        sql = 'select id from site where base_url=$1'
        site_id = await conn.fetchval(sql, base_url)
        if site_id:
            site = await Site().load(conn, site_id)
            logger.warning(f'site_id: {site.id_}')
            logger.warning(f'crawl_enabled: {site.crawl_enabled}')
            site.simhash_index = await get_simhash_index(conn, site_id)
        else:
            logger.warning('Site not found')

    if site_id:
        if site.crawl_enabled:
            await run_crawl(config, pool, site, crawl_type, path)
        else:
            logger.warning('Site has crawl_enabled=false')

    # shutdown
    await pgpool.shutdown()
|
|
|
|
|
|
class AppMock:
    """Minimal stand-in for the application object used by crawl helpers.

    Exposes only the attributes read downstream: ``config``,
    ``search_engine`` and a ``plugins`` mapping whose
    ``filter_resource_path`` entry is a pass-through path filter.
    """

    class _PassThroughFilter:
        """Plugin stub whose rp_filter leaves the resource path unchanged."""

        def rp_filter(self, site, durl):
            # Return the path-with-arguments form of the URL as-is.
            return durl.pwa()

    def __init__(self, config, search_engine):
        self.config = config
        self.search_engine = search_engine
        self.plugins = {'filter_resource_path': self._PassThroughFilter()}
|
|
|
|
|
|
async def run_crawl(config, pool, site, crawl_type, path):
    """
    Crawl the given site, or just one of its paths.

    If *path* is given, process only that site_path row; otherwise loop
    over pending paths (all of them for a full crawl, only new ones for
    a feed crawl) until exhausted.

    The HTTP session and the search engine are always released, even
    when processing raises (previously both leaked on error).
    """
    session = aiohttp.ClientSession()
    search_engine = None
    try:
        fetcher = ResourceFetcher(session)
        tf = TensorFlow(config['tensorflow'], session)
        search_engine = await startup_engine(config)
        app = AppMock(config, search_engine)
        async with pool.acquire() as conn:
            is_full = crawl_type == 'full'
            crawl = await get_or_create_crawl(conn, site.id_, is_full=is_full)
            logger.warning(crawl)
            if path:
                sql = "SELECT * FROM site_path WHERE site_id=$1 AND path=$2"
                row = await conn.fetchrow(sql, site.id_, path)
                if row:
                    site_path = await SitePath().load_from_row(row)
                    # 999: presumably a worker id used for logging only
                    # — TODO confirm against process_site_path.
                    await process_site_path(
                        app,
                        999,
                        conn,
                        fetcher,
                        tf,
                        site,
                        site_path,
                    )
                else:
                    logger.error('Path does not exist in table site_path')
            else:
                while True:
                    site_path = await get_site_path(
                        conn,
                        site,
                        crawl.t_begin,
                        only_new=not crawl.is_full,
                    )
                    if not site_path:
                        logger.warning('Paths exhausted.')
                        break
                    logger.warning(site_path)
                    is_new_resource = await process_site_path(
                        app,
                        999,
                        conn,
                        fetcher,
                        tf,
                        site,
                        site_path,
                    )
                    logger.warning(f'Is new: {is_new_resource}')
    finally:
        # Release resources even if crawling raised.
        if search_engine is not None:
            await shutdown_engine(search_engine)
        await session.close()
|
|
|
|
|
|
# Script entry point: drive the async crawl with a fresh event loop.
if __name__ == '__main__':
    asyncio.run(run())
|