---
# Main configuration of an atextcrawler instance (main.yaml).
# Keys that are commented out fall back to the default value
# documented above them.

# Name of this instance
# Default value: atextcrawler
# Allowed values: arbitrary string
instance_name: atextcrawler

# Which kind of instance is this?
# Default value: prod
# Allowed values are:
# - 'dev': development instance
# - 'staging': staging instance
# - 'prod': production instance
instance_type: prod

# Log level
# Default value: info
# Allowed values: critical, error, warning, info, debug
log_level: info

# Plugins directory
# If given as relative path, it will be relative to the
# directory of this file (main.yaml).
# Read documentation on plugins.
# Default value: plugins
# Hint: Create an empty __init__.py in the plugins_dir.
plugins_dir: plugins

# Parameters for access to the PostgreSQL service
# No default values; must be set.
postgresql:
  host: localhost
  port: 5432
  database: atextcrawler
  user: atextcrawler
  # Placeholder; replace with the real password. Kept quoted so that
  # the value is always loaded as a string.
  password: "________________________"

# Crawling
crawl:
  # Number of concurrent workers
  # Default value: 10
  # Allowed values: integer >=0 and <=1000
  workers: 10

  # Delay in seconds between attempts to fetch items
  # from site_queue if the last attempt gave no item.
  # Also the delay in seconds after a worker has found
  # no site to process
  # Default value: 600
  # Allowed values: positive number
  #site_delay: 10

  # Time interval in seconds between site updates when
  # handling queued base URLs
  # Default value: 3600
  # Allowed values: positive number
  #site_revisit_interval: 3600

  # Delay in seconds between attempts to process
  # individual resources (pages etc.) of a site
  # Default value: 5
  # Allowed values: positive number
  #resource_delay: 3

  # Default interval in seconds between full crawls of a site
  # Default value: 864000 (10 days)
  # Allowed values: positive number
  #full_crawl_interval: 864000

  # Default interval in seconds between feed crawls of a site
  # Default value: 86400 (1 day)
  # Allowed values: positive number
  #feed_crawl_interval: 86400

  # Minimum length of the text (in characters) extracted from
  # a resource; resources with shorter texts are not stored.
  # Default value: 300
  # Allowed values: positive number
  #min_text_length: 300

# Parameters for access to the ElasticSearch service
# No default values; must be set.
elasticsearch:
  # host on which ES is running
  host: localhost
  # API key for accessing ES
  api_key: "____________________"
  # API user id
  id: "____________________"
  # Index base name (full index names will have '_text_{language}' appended)
  index_base_name: atext

# Tensorflow access
tensorflow:
  # The prediction endpoint of the model server's sentence model
  model_server_endpoint: http://localhost:9000/v1/models/sentences:predict