# atextcrawler/doc/source/config_template/main.yaml
#
# 95 lines
# 2.7 KiB
# YAML

# Name of this instance.
# Default value: atextcrawler
# Allowed values: arbitrary string
instance_name: atextcrawler
# Type of this instance (development, staging or production).
# Default value: prod
# Allowed values:
# - 'dev': development instance
# - 'staging': staging instance
# - 'prod': production instance
instance_type: prod
# Log level to use.
# Default value: info
# Allowed values: critical, error, warning, info, debug
log_level: info
# Plugins directory.
# A relative path is resolved relative to the directory
# containing this file (main.yaml).
# See the documentation on plugins for details.
# Default value: plugins
# Hint: Create an empty __init__.py in the plugins_dir.
plugins_dir: plugins
# Parameters for access to the PostgreSQL service.
# No default values; every key below must be set.
# The keys must be indented under 'postgresql:'; otherwise they
# are parsed as top-level keys and 'postgresql' itself is null.
postgresql:
  host: localhost
  port: 5432
  database: atextcrawler
  user: atextcrawler
  # Placeholder; replace with the real password. Keep the quotes so
  # that characters such as '#' or ': ' are not interpreted as YAML.
  password: "________________________"
# Crawling
# All settings below belong to the 'crawl' mapping and must stay
# indented under it. Lines starting with '#' directly followed by a
# key name are optional settings; remove the '#' (keeping the
# indentation) to activate them.
crawl:
  # Number of concurrent workers
  # Default value: 10
  # Allowed values: integer >=0 and <=1000
  workers: 10

  # Delay in seconds before retrying to fetch an item from
  # site_queue after an attempt that returned no item; also
  # the delay in seconds after a worker has found no site
  # to process.
  # Default value: 600
  # Allowed values: positive number
  # (Note: the example value below is 10, not the default.)
  #site_delay: 10

  # Time interval in seconds between site updates when
  # handling queued base URLs
  # Default value: 3600
  # Allowed values: positive number
  #site_revisit_interval: 3600

  # Delay in seconds between attempts to process
  # individual resources (pages etc.) of a site
  # Default value: 5
  # Allowed values: positive number
  # (Note: the example value below is 3, not the default.)
  #resource_delay: 3

  # Default interval in seconds between full crawls of a site
  # Default value: 864000 (10 days)
  # Allowed values: positive number
  #full_crawl_interval: 864000

  # Default interval in seconds between feed crawls of a site
  # Default value: 86400 (1 day)
  # Allowed values: positive number
  #feed_crawl_interval: 86400

  # Minimum length of the text (in characters) extracted from
  # a resource; resources with shorter texts are not stored.
  # Default value: 300
  # Allowed values: positive number
  #min_text_length: 300
# Parameters for access to the ElasticSearch service.
# No default values; every key below must be set.
# The keys must be indented under 'elasticsearch:'; otherwise they
# are parsed as top-level keys and 'elasticsearch' itself is null.
elasticsearch:
  # Host on which ES is running
  host: localhost
  # API key for accessing ES (placeholder; replace with the real key)
  api_key: "____________________"
  # API user id (placeholder; replace with the real id)
  id: "____________________"
  # Index base name (full index names will have '_text_{language}' appended)
  index_base_name: atext
# Tensorflow access
# The key must be indented under 'tensorflow:'; otherwise it is
# parsed as a top-level key and 'tensorflow' itself is null.
tensorflow:
  # The prediction endpoint of the model server's sentence model
  model_server_endpoint: http://localhost:9000/v1/models/sentences:predict