atextcrawler/src/atextcrawler/config.py

339 lines
10 KiB
Python

"""
Configuration loader and validator.
"""
import os
import re
import sys
from io import TextIOBase
from pathlib import Path
from typing import Any, Optional, Union
from voluptuous import All
from voluptuous import Any as VAny
from voluptuous import Invalid, Length, Range, Required, Schema, Url
from yaml import load
try:
from yaml import CLoader as Loader # type: ignore
except ImportError:
from yaml import Loader # type: ignore
class ConfigError(Exception):
    """
    Application configuration error.

    Wraps an arbitrary error (an exception or a plain message); its
    string form is kept in attribute ``msg``.
    """

    def __init__(self, err):
        # Keep only the text, so exceptions and strings are treated alike.
        self.msg = str(err)

    def __str__(self):
        return 'Application configuration error: ' + self.msg
class Config:
    """
    Application configuration.

    Access the full application configuration using :meth:`get`.

    It is the validated content of main.yaml (a dictionary) with these
    additions:

    * 'config_dir': the configuration directory being used
    * 'postgresql': may be overridden by environment variable
      ATEXTCRAWLER_POSTGRESQL; its 'min_size' and 'max_size' are
      both set to the number of crawl workers plus 2
    """

    # Cached configuration; filled by the first successful call of get().
    config = None

    @classmethod
    def get(
        cls,
        out: Optional[TextIOBase] = None,
    ) -> Optional[dict]:
        """
        Return the app configuration; load and validate it on first use.

        On errors print them to *out* (default: sys.stdout). If *out* is
        sys.stdout, then also exit with exit code 2; otherwise
        return None.
        """
        if cls.config:
            return cls.config
        if out is None:
            out = sys.stdout  # type: ignore

        loaded = _load_config()
        validated = None
        if isinstance(loaded, ConfigError):
            msg = f'ERROR: configuration could not be loaded: {loaded}'
        else:
            validated = _validate_config(loaded)
            if isinstance(validated, ConfigError):
                config_dir = loaded.get('config_dir')
                msg = (
                    f'ERROR: invalid configuration in {config_dir}:'
                    f' {validated}'
                )
            else:
                msg = None

        # A message exists exactly when loading or validation failed.
        if msg is not None:
            print(msg, file=out)
            if out == sys.stdout:
                sys.exit(2)
            return None

        # Connection pool limits follow the number of crawl workers.
        pool_size = validated['crawl']['workers'] + 2
        validated['postgresql']['min_size'] = pool_size
        validated['postgresql']['max_size'] = pool_size
        cls.config = validated
        return validated
def _load_config() -> Union[ConfigError, dict]:
    """
    Load the main configuration; search in multiple directories.

    We search these locations; the first location containing main.yaml
    will be used:

    * a directory defined in environment variable ATEXTCRAWLER_CONFIG_DIR
    * subdir .config/atextcrawler in the user's home (`$HOME`)
    * /etc/atextcrawler

    On failure return a :class:`ConfigError` describing the first error.
    Otherwise return the content of main.yaml as a dict with these
    modifications:

    * `postgresql`: replaced by the settings from environment variable
      ATEXTCRAWLER_POSTGRESQL, if that variable is set
    * `config_dir`: the used configuration directory (added)

    The returned dict is not validated here. In particular a missing
    `postgresql` section is not an error at this stage.
    """
    config_dirs = []
    if env_conf := os.environ.get('ATEXTCRAWLER_CONFIG_DIR'):
        config_dirs.append(Path(env_conf))
    if env_home := os.environ.get('HOME'):
        config_dirs.append(Path(env_home) / '.config' / 'atextcrawler')
    config_dirs.append(Path('/etc/atextcrawler'))

    for config_dir in config_dirs:
        main_yaml_path = config_dir / 'main.yaml'
        if main_yaml_path.exists():
            break
    else:
        locs = ', '.join(str(loc) for loc in config_dirs)
        msg = (
            f'Missing main.yaml in all config locations: {locs}\n'
            f'Hint: You may use environment variable ATEXTCRAWLER_CONFIG_DIR'
            f' to define a custom config directory.'
        )
        return ConfigError(msg)

    # load main.yaml
    # NOTE(review): Loader is PyYAML's full loader, which can construct
    # arbitrary Python objects; main.yaml must come from a trusted source.
    try:
        with main_yaml_path.open() as main_yaml:
            main_config = load(main_yaml.read(), Loader=Loader)
    except Exception as err:
        return ConfigError(f'Invalid YAML in {main_yaml_path}:\n {err}')

    # main_config must be a dict (an empty file yields None)
    if not isinstance(main_config, dict):
        return ConfigError(f'File {main_yaml_path} must contain a dictionary')

    # postgresql config from environment has precedence
    postgresql_config = _get_env_postgresql()
    if isinstance(postgresql_config, ConfigError):
        return postgresql_config
    if postgresql_config:
        main_config['postgresql'] = postgresql_config
    # Without an override main.yaml is left as it is: if it has no
    # 'postgresql' key, that is reported by validation instead of
    # raising a KeyError here.

    main_config['config_dir'] = str(config_dir)
    return main_config
def _get_env_postgresql() -> Union[ConfigError, Optional[dict]]:
    """
    Load PostgreSQL config from environment variable ATEXTCRAWLER_POSTGRESQL.

    The value consists of space-separated ``name=value`` pairs. Known
    names are host, port, database, user, password and schema_name;
    anything else is ignored. A value may be unquoted (no spaces or
    quotes), or enclosed in double or single quotes, where the quote
    character itself can be escaped with a backslash.

    host, port and schema_name default to 'localhost', 5432 and 'public'.

    Return an error or the PostgreSQL config (which is None if
    the environment variable is not defined or empty). In the config
    all values are strings, except for port, which is an int.
    """
    env_var = 'ATEXTCRAWLER_POSTGRESQL'
    value = os.environ.get(env_var, '').strip()
    if not value:
        return None
    param_names = (
        'host',
        'port',
        'database',
        'user',
        'password',
        'schema_name',
    )
    re_dsn = re.compile(
        # A name must begin at the start of the string or after a space,
        # so that e.g. "xhost=..." is not taken for "host=...".
        r'((?<![^ ])(' + '|'.join(param_names) + r')'
        r'=("(((?=[^"\\]).|\\.)*)"'  # value in double quotes
        r"|'(((?=[^'\\]).|\\.)*)'"  # value in single quotes
        '|([^"\' ]*)'  # value unquoted
        r')( |$))+?'
    )
    params = {}
    # Of the 9 groups only the name and the three value variants matter.
    for _, varname, _, v1, _, v2, _, v3, _ in re_dsn.findall(value):
        params[varname] = (
            v3
            or (v1 or '').replace('\\"', '"')
            or (v2 or '').replace("\\'", "'")
        )
    params.setdefault('host', 'localhost')
    params.setdefault('port', '5432')
    params.setdefault('schema_name', 'public')
    for name in param_names:
        if name not in params:
            return ConfigError(
                f'Missing {name} in environment variable {env_var}'
            )
    # A non-numeric port is a configuration error, not a crash.
    try:
        params['port'] = int(params['port'])
    except ValueError:
        return ConfigError(
            f'Invalid port "{params["port"]}"'
            f' in environment variable {env_var}'
        )
    return params
def _validate_config(config: Any) -> Union[ConfigError, dict]:
    """
    Check *config* against the main schema and fill in default values.

    Return the configuration with defaults added, or - if validation
    raises - a :class:`ConfigError` wrapping the raised exception.
    """
    try:
        validated = schema_main(config)
    except Exception as exc:
        return ConfigError(exc)
    return validated
def plugins_dir(config):
    """
    Validate plugins directory (absolute or relative path).

    *config* is the main configuration dict; its keys 'config_dir' and
    'plugins_dir' are read. If plugins_dir is a relative path, it is
    resolved against config_dir and the joined path is written back to
    ``config['plugins_dir']`` (as a string). An absolute path is left
    unchanged.

    Return *config*. Raise :class:`Invalid` if the value cannot be
    interpreted as a path or if the directory has no "__init__.py".
    """
    config_dir = config['config_dir']
    configured = config['plugins_dir']
    try:
        path = Path(configured)
        if not path.is_absolute():
            path = Path(config_dir) / path
            config['plugins_dir'] = str(path)
    except (TypeError, ValueError) as err:
        # Only path construction problems are expected here; anything
        # else (e.g. KeyboardInterrupt) must not be swallowed.
        raise Invalid(
            f'Invalid plugins_dir "{configured}" not found'
        ) from err
    if not (path / '__init__.py').exists():
        raise Invalid(f'plugins_dir "{path}" has no "__init__.py"')
    return config
def postgresql_identifier(value):
    """
    Validate a PostgreSQL identifier.

    Accepted are strings of 1 to 31 characters beginning with a
    lowercase ASCII letter, followed by lowercase ASCII letters,
    digits or underscores.

    Return *value* unchanged if valid, otherwise raise :class:`Invalid`.
    """
    # fullmatch: unlike match() with '$' it does not tolerate a
    # trailing newline.
    if isinstance(value, str) and re.fullmatch(
        '[a-z][a-z0-9_]{0,30}', value
    ):
        return value
    raise Invalid(
        f'Invalid PostgreSQL identifier "{value}", '
        # Plain string: in an f-string {0,30} would render as "(0, 30)".
        'pattern must be: [a-z][a-z0-9_]{0,30}'
    )
def positive_number(value):
    """
    Validate a positive number (int or float).

    Booleans are rejected although ``bool`` is a subclass of ``int``;
    otherwise a YAML value like ``true`` would pass as the number 1.

    Return *value* unchanged if valid, otherwise raise :class:`Invalid`.
    """
    if (
        isinstance(value, (int, float))
        and not isinstance(value, bool)
        and value > 0
    ):
        return value
    raise Invalid('Not a positive number')
# Schema of the 'postgresql' section: connection parameters.
# port and schema_name have defaults; all other keys must be given.
_postgresql_non_empty_str = All(str, Length(min=1))
schema_postgresql = Schema(
    {
        Required('host'): _postgresql_non_empty_str,
        Required('port', default=5432): All(int, Range(min=0, max=65535)),
        Required('database'): _postgresql_non_empty_str,
        Required('user'): _postgresql_non_empty_str,
        # The password may be an empty string.
        Required('password'): str,
        Required('schema_name', default='public'): postgresql_identifier,
    }
)
# Schema of the 'crawl' section. Every key has a default, so the
# section may be an empty dict.
_crawl_positive_defaults = (
    ('site_delay', 600),
    ('site_revisit_interval', 3600),
    ('resource_delay', 5),
    ('full_crawl_interval', 864000),
    ('feed_crawl_interval', 86400),
    ('min_text_length', 300),
)
schema_crawl = Schema(
    {
        Required('workers', default=10): All(int, Range(min=0, max=1000)),
        # All remaining settings are positive numbers.
        **{
            Required(key, default=default): positive_number
            for key, default in _crawl_positive_defaults
        },
    }
)
# Schema of the 'elasticsearch' section: all keys are required and
# must be non-empty strings.
schema_elasticsearch = Schema(
    {
        Required(key): All(str, Length(min=1))
        for key in ('host', 'api_key', 'id', 'index_base_name')
    }
)
# Schema of the 'tensorflow' section: a single required key whose
# value must be a URL.
_tensorflow_endpoint_key = Required('model_server_endpoint')
schema_tensorflow = Schema({_tensorflow_endpoint_key: Url()})
# Allowed values of the enumerated top-level settings.
_main_instance_types = ('dev', 'staging', 'prod')
_main_log_levels = ('critical', 'error', 'warning', 'info', 'debug')

# Keys of the main configuration and their validators.
_main_mapping = {
    Required('config_dir'): All(str, Length(min=1)),
    Required('instance_name', default='atextcrawler'): postgresql_identifier,
    Required('instance_type', default='prod'): VAny(*_main_instance_types),
    Required('log_level', default='info'): VAny(*_main_log_levels),
    Required('plugins_dir', default='plugins'): All(str, Length(min=1)),
    Required('postgresql'): schema_postgresql,
    Required('crawl'): schema_crawl,
    Required('elasticsearch'): schema_elasticsearch,
    Required('tensorflow'): schema_tensorflow,
}

# Schema of the whole configuration: first the keys are validated,
# then plugins_dir() checks the plugins directory of the result.
schema_main = Schema(All(_main_mapping, plugins_dir))
if __name__ == '__main__':
    # Run as a script: pretty-print the loaded configuration.
    from pprint import pprint

    # get() is a classmethod, so no instance is needed.
    app_config = Config.get()
    pprint(app_config)