339 lines
10 KiB
Python
339 lines
10 KiB
Python
"""
|
|
Configuration loader and validator.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
from io import TextIOBase
|
|
from pathlib import Path
|
|
from typing import Any, Optional, Union
|
|
|
|
from voluptuous import All
|
|
from voluptuous import Any as VAny
|
|
from voluptuous import Invalid, Length, Range, Required, Schema, Url
|
|
from yaml import load
|
|
|
|
try:
|
|
from yaml import CLoader as Loader # type: ignore
|
|
except ImportError:
|
|
from yaml import Loader # type: ignore
|
|
|
|
|
|
class ConfigError(Exception):
    """
    Error describing an unusable application configuration.

    The text form of the wrapped error (or message) is stored in
    attribute ``msg``.
    """

    def __init__(self, err):
        """Remember *err* (a message or another exception) as text."""
        text = str(err)
        self.msg = text

    def __str__(self):
        """Return the stored message with a fixed explanatory prefix."""
        return f'Application configuration error: {self.msg}'
|
|
|
|
|
|
class Config:
    """
    Application configuration, loaded once and cached on the class.

    Obtain the configuration with :meth:`get`.

    The configuration is the validated content of main.yaml. It contains
    (among others) the key 'config_dir' with the configuration directory
    being used. The 'postgresql' section may have been replaced by the
    settings from environment variable ATEXTCRAWLER_POSTGRESQL, and it
    is extended with the pool sizes 'min_size' and 'max_size'.
    """

    # Cached, validated configuration; None until the first successful load.
    config = None

    @classmethod
    def get(
        cls,
        out: Optional[TextIOBase] = None,
    ) -> Optional[dict]:
        """
        Return the app configuration, loading and validating it on first use.

        Errors are printed to *out* (default: sys.stdout). If *out* is
        sys.stdout, the process then exits with exit code 2; for any
        other *out*, None is returned.
        """
        if cls.config:
            return cls.config
        if out is None:
            out = sys.stdout  # type: ignore

        # Collect at most one problem description: either loading failed
        # or the loaded data did not pass validation.
        problem = None
        validated = None
        loaded = _load_config()
        if isinstance(loaded, ConfigError):
            problem = f'ERROR: configuration could not be loaded: {loaded}'
        else:
            validated = _validate_config(loaded)
            if isinstance(validated, ConfigError):
                where = loaded.get('config_dir')
                problem = (
                    f'ERROR: invalid configuration in {where}:'
                    f' {validated}'
                )

        if problem is not None:
            print(problem, file=out)
            if out == sys.stdout:
                sys.exit(2)
            return None

        # Both connection pool bounds are set to the worker count plus two.
        pool_size = validated['crawl']['workers'] + 2
        validated['postgresql']['min_size'] = pool_size
        validated['postgresql']['max_size'] = pool_size
        cls.config = validated
        return validated
|
|
|
|
|
|
def _load_config() -> Union[ConfigError, dict]:
    """
    Load the main configuration; search in multiple directories.

    We search these locations; the first location containing main.yaml
    will be used:

      * a directory defined in environment variable ATEXTCRAWLER_CONFIG_DIR
      * subdir .config/atextcrawler in the user's home (`$HOME`)
      * /etc/atextcrawler

    If environment variable ATEXTCRAWLER_POSTGRESQL is set, the settings
    parsed from it replace the 'postgresql' section of main.yaml.

    On failure return the first error as a :class:`ConfigError`.
    Otherwise return the content of main.yaml as a dict, extended with
    key `config_dir` (the used configuration directory, as a string).
    The returned dict is not validated yet; a missing 'postgresql'
    section is left for schema validation to report.
    """
    config_dirs = []
    if env_conf := os.environ.get('ATEXTCRAWLER_CONFIG_DIR'):
        config_dirs.append(Path(env_conf))
    if env_home := os.environ.get('HOME'):
        config_dirs.append(Path(env_home) / '.config' / 'atextcrawler')
    config_dirs.append(Path('/etc/atextcrawler'))

    for config_dir in config_dirs:
        main_yaml_path = config_dir / 'main.yaml'
        if main_yaml_path.exists():
            break
    else:
        # No candidate directory contains main.yaml.
        locs = ', '.join(str(loc) for loc in config_dirs)
        msg = (
            f'Missing main.yaml in all config locations: {locs}\n'
            f'Hint: You may use environment variable ATEXTCRAWLER_CONFIG_DIR'
            f' to define a custom config directory.'
        )
        return ConfigError(msg)

    # load main.yaml
    # NOTE(review): `load` with the full (non-safe) Loader can construct
    # arbitrary Python objects from YAML tags. This is acceptable only
    # while main.yaml is a trusted file; consider a safe loader otherwise.
    try:
        with main_yaml_path.open() as main_yaml:
            main_config = load(main_yaml.read(), Loader=Loader)
    except Exception as err:
        return ConfigError(f'Invalid YAML in {main_yaml_path}:\n {err}')

    # main_config must be a dict
    if not isinstance(main_config, dict):
        return ConfigError(f'File {main_yaml_path} must contain a dictionary')

    # postgresql config from environment has precedence
    postgresql_config = _get_env_postgresql()
    if isinstance(postgresql_config, ConfigError):
        return postgresql_config
    if postgresql_config is not None:
        # Only override; never index main_config['postgresql'] here, as
        # the section may be absent (which would raise KeyError).
        main_config['postgresql'] = postgresql_config

    main_config['config_dir'] = str(config_dir)
    return main_config
|
|
|
|
|
|
def _get_env_postgresql() -> Union[ConfigError, Optional[dict]]:
|
|
"""
|
|
Load PostgreSQL config from environment variable ATEXTCRAWLER_POSTGRESQL.
|
|
|
|
Return an error or the PostgreSQL config (which can be None if
|
|
the environment variable is not defined.
|
|
"""
|
|
env_var = 'ATEXTCRAWLER_POSTGRESQL'
|
|
value = os.environ.get(env_var, '').strip()
|
|
if not value:
|
|
return None
|
|
param_names = (
|
|
'host',
|
|
'port',
|
|
'database',
|
|
'user',
|
|
'password',
|
|
'schema_name',
|
|
)
|
|
re_dsn = re.compile(
|
|
'((' + '|'.join(param_names) + ')'
|
|
'=("(((?=[^"\\\\]).|\\\\.)*)"' # value in double quotes
|
|
'|\'(((?=[^\'\\\\]).|\\\\.)*)\'' # value in single quotes
|
|
'|([^"\' ]*)' # value unquoted
|
|
')( |$))+?'
|
|
)
|
|
params = {}
|
|
for _, varname, _, v1, _, v2, _, v3, _ in re_dsn.findall(value):
|
|
params[varname] = (
|
|
v3
|
|
or (v1 or '').replace('\\"', '"')
|
|
or (v2 or '').replace("\\'", "'")
|
|
)
|
|
if 'host' not in params:
|
|
params['host'] = 'localhost'
|
|
if 'port' not in params:
|
|
params['port'] = '5432'
|
|
if 'schema_name' not in params:
|
|
params['schema_name'] = 'public'
|
|
for name in param_names:
|
|
if name not in params:
|
|
return ConfigError(
|
|
f'Missing {name} in environment variable {env_var}'
|
|
)
|
|
else:
|
|
params['port'] = int(params['port'])
|
|
return params
|
|
|
|
|
|
def _validate_config(config: Any) -> Union[ConfigError, dict]:
    """
    Check *config* against the main schema, filling in default values.

    Return the validated configuration (with defaults added), or a
    :class:`ConfigError` wrapping the first error raised by validation.
    """
    try:
        validated = schema_main(config)
    except Exception as err:
        return ConfigError(err)
    else:
        return validated
|
|
|
|
|
|
def plugins_dir(config):
    """
    Validate plugins directory (absolute or relative path).

    If `config['plugins_dir']` is a relative path, prepend
    `config['config_dir']` and store the joined path (as a string) back
    into *config*. An absolute path is left unchanged in *config*.

    Return *config* (the same, possibly modified, dict).

    Raise :class:`Invalid` if a path cannot be built from the configured
    values or if the directory does not contain an `__init__.py`.
    """
    config_dir = config['config_dir']
    configured = config['plugins_dir']
    try:
        path = Path(configured)
        if not path.is_absolute():
            path = Path(config_dir) / path
            config['plugins_dir'] = str(path)
    except TypeError as err:
        # Path() rejects values that are neither str nor path-like.
        raise Invalid(
            f'Invalid plugins_dir "{configured}" not found'
        ) from err
    if not (path / '__init__.py').exists():
        raise Invalid(f'plugins_dir "{path}" has no "__init__.py"')
    return config
|
|
|
|
|
|
def postgresql_identifier(value):
    """
    Validate a PostgreSQL identifier.

    Accepted are strings of 1 to 31 characters that start with a
    lowercase ASCII letter and continue with lowercase ASCII letters,
    digits or underscores.

    Return *value* unchanged if it is valid, otherwise raise
    :class:`Invalid`.
    """
    # fullmatch (instead of match with '^...$') also rejects a trailing
    # newline, which '$' would silently accept.
    if not isinstance(value, str) or not re.fullmatch(
        '[a-z][a-z0-9_]{0,30}', value
    ):
        # The pattern text must not be part of an f-string: there,
        # '{0,30}' would be evaluated and rendered as '(0, 30)'.
        raise Invalid(
            f'Invalid PostgreSQL identifier "{value}", '
            'pattern must be: [a-z][a-z0-9_]{0,30}'
        )
    return value
|
|
|
|
|
|
def positive_number(value):
    """
    Validate a positive number (int or float).

    Return *value* unchanged if it is an int or float greater than zero,
    otherwise raise :class:`Invalid`. Booleans are rejected although
    `bool` is a subclass of `int`, so that e.g. a YAML `yes` is not
    accepted as the number 1.
    """
    is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
    if is_number and value > 0:
        return value
    raise Invalid('Not a positive number')
|
|
|
|
|
|
# Validation schema for the 'postgresql' section of the configuration.
# 'port' and 'schema_name' have defaults; all other keys must be given.
schema_postgresql = Schema(
    {
        Required('host'): All(str, Length(min=1)),
        Required('port', default=5432): All(int, Range(min=0, max=65535)),
        Required('database'): All(str, Length(min=1)),
        Required('user'): All(str, Length(min=1)),
        # the password must be a string, but may be empty
        Required('password'): str,
        Required('schema_name', default='public'): postgresql_identifier,
    }
)
|
|
|
|
|
|
# Validation schema for the 'crawl' section of the configuration.
# Every key has a default value; 'workers' is an integer in 0..1000 and
# all other values must be positive numbers.
# NOTE(review): the units of the delay/interval values are not visible
# here (presumably seconds) -- confirm against the code using them.
schema_crawl = Schema(
    {
        Required('workers', default=10): All(int, Range(min=0, max=1000)),
        Required('site_delay', default=600): positive_number,
        Required('site_revisit_interval', default=3600): positive_number,
        Required('resource_delay', default=5): positive_number,
        Required('full_crawl_interval', default=864000): positive_number,
        Required('feed_crawl_interval', default=86400): positive_number,
        Required('min_text_length', default=300): positive_number,
    }
)
|
|
|
|
|
|
# Validation schema for the 'elasticsearch' section of the configuration.
# All keys are required (no defaults) and must be non-empty strings.
schema_elasticsearch = Schema(
    {
        Required('host'): All(str, Length(min=1)),
        Required('api_key'): All(str, Length(min=1)),
        Required('id'): All(str, Length(min=1)),
        Required('index_base_name'): All(str, Length(min=1)),
    }
)
|
|
|
|
|
|
# Validation schema for the 'tensorflow' section of the configuration.
# The only key is required and is checked with the `Url` validator.
schema_tensorflow = Schema(
    {
        Required('model_server_endpoint'): Url(),
    }
)
|
|
|
|
|
|
# Validation schema for the whole main configuration.
# `All` applies its validators in order: first the dict schema (types,
# allowed values, defaults and the nested section schemas), then
# `plugins_dir`, which checks the plugins directory on the validated dict.
schema_main = Schema(
    All(
        {
            Required('config_dir'): All(str, Length(min=1)),
            Required(
                'instance_name', default='atextcrawler'
            ): postgresql_identifier,
            Required('instance_type', default='prod'): VAny(
                'dev',
                'staging',
                'prod',
            ),
            Required('log_level', default='info'): VAny(
                'critical',
                'error',
                'warning',
                'info',
                'debug',
            ),
            Required('plugins_dir', default='plugins'): All(
                str, Length(min=1)
            ),
            Required('postgresql'): schema_postgresql,
            Required('crawl'): schema_crawl,
            Required('elasticsearch'): schema_elasticsearch,
            Required('tensorflow'): schema_tensorflow,
        },
        plugins_dir,
    )
)
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual check: load, validate and pretty-print the configuration.
    from pprint import pprint

    # `get` is a classmethod, so no Config instance is needed.
    pprint(Config.get())
|