"""
Configuration loader and validator.
"""

import os
import re
import sys
from io import TextIOBase
from pathlib import Path
from typing import Any, Optional, Union

from voluptuous import All
from voluptuous import Any as VAny
from voluptuous import Invalid, Length, Range, Required, Schema, Url
from yaml import load

# NOTE(review): `Loader` / `CLoader` are PyYAML's full-featured loaders, not
# the safe ones; they may construct arbitrary Python objects. main.yaml must
# therefore come from a trusted location. Consider switching to
# `SafeLoader` / `CSafeLoader`.
try:
    from yaml import CLoader as Loader  # type: ignore
except ImportError:
    from yaml import Loader  # type: ignore


class ConfigError(Exception):
    """
    Application configuration error.

    The original error (any object) is stored stringified in :attr:`msg`.
    """

    def __init__(self, err):
        self.msg = str(err)
        # Populate Exception.args so that repr() and tracebacks are useful.
        super().__init__(self.msg)

    def __str__(self):
        return f'Application configuration error: {self.msg}'


class Config:
    """
    Application configuration.

    Access the full application configuration using :meth:`get`.

    It is the validated content of main.yaml (a dictionary) with default
    values filled in and with these additions:

      * 'config_dir': the configuration directory being used
      * 'postgresql': may be overridden by environment variable
        ATEXTCRAWLER_POSTGRESQL; its keys 'min_size' and 'max_size'
        are both set to the number of crawl workers plus 2
    """

    # Cached validated configuration; filled by the first successful get().
    config = None

    @classmethod
    def get(
        cls,
        out: Optional[TextIOBase] = None,
    ) -> Optional[dict]:
        """
        Load and validate app configuration if not already done; return it.

        On errors print them to *out* and if out is sys.stdout, then
        also exit with exit code 2. Otherwise just return None.
        """
        if cls.config:
            return cls.config
        if out is None:
            out = sys.stdout  # type: ignore

        msg = None
        config = None
        loaded = _load_config()
        if isinstance(loaded, ConfigError):
            msg = f'ERROR: configuration could not be loaded: {loaded}'
        else:
            validated = _validate_config(loaded)
            if isinstance(validated, ConfigError):
                config_dir = loaded.get('config_dir')
                msg = (
                    f'ERROR: invalid configuration in {config_dir}:'
                    f' {validated}'
                )
            else:
                config = validated

        if msg is not None:
            print(msg, file=out)
            if out is sys.stdout:
                sys.exit(2)
            return None

        # Both pool limits are derived from the number of crawl workers.
        pool_size = config['crawl']['workers'] + 2
        config['postgresql']['min_size'] = pool_size
        config['postgresql']['max_size'] = pool_size
        cls.config = config
        return config


def _load_config() -> Union[ConfigError, dict]:
    """
    Load configuration; search in multiple directories.

    We search these locations; the first location containing main.yaml
    will be used::

      * a directory defined in environment variable
        ATEXTCRAWLER_CONFIG_DIR
      * subdir .config/atextcrawler in the user's home (`$HOME`)
      * /etc/atextcrawler

    The plugins directory (by default the subdirectory 'plugins' of the
    configuration directory) is not checked here, but during validation
    (see :func:`plugins_dir`).

    On failure return a :class:`ConfigError` describing the first error.

    Otherwise return the content of main.yaml (a dict), modified like this:

      * `config_dir`: the used configuration directory is added
      * `postgresql`: replaced with the configuration from environment
        variable ATEXTCRAWLER_POSTGRESQL, if that variable is defined
    """
    config_dirs = []
    if env_conf := os.environ.get('ATEXTCRAWLER_CONFIG_DIR'):
        config_dirs.append(Path(env_conf))
    if env_home := os.environ.get('HOME'):
        config_dirs.append(Path(env_home) / '.config' / 'atextcrawler')
    config_dirs.append(Path('/etc/atextcrawler'))

    for config_dir in config_dirs:
        main_yaml_path = config_dir / 'main.yaml'
        if main_yaml_path.exists():
            break
    else:
        locs = ', '.join(str(loc) for loc in config_dirs if loc)
        msg = (
            f'Missing main.yaml in all config locations: {locs}\n'
            f'Hint: You may use environment variable ATEXTCRAWLER_CONFIG_DIR'
            f' to define a custom config directory.'
        )
        return ConfigError(msg)

    # load main.yaml
    try:
        with main_yaml_path.open() as main_yaml:
            main_config = load(main_yaml.read(), Loader=Loader)
    except Exception as err:
        return ConfigError(f'Invalid YAML in {main_yaml_path}:\n {err}')

    # main_config must be a dict
    if not isinstance(main_config, dict):
        return ConfigError(f'File {main_yaml_path} must contain a dictionary')

    # postgresql config from environment has precedence
    postgresql_config = _get_env_postgresql()
    if isinstance(postgresql_config, ConfigError):
        return postgresql_config
    # Only override when the environment provides a configuration; a
    # missing 'postgresql' section in main.yaml is then reported by the
    # schema validation instead of raising a KeyError here.
    if postgresql_config:
        main_config['postgresql'] = postgresql_config

    main_config['config_dir'] = str(config_dir)
    return main_config


def _get_env_postgresql() -> Union[ConfigError, Optional[dict]]:
    """
    Load PostgreSQL config from environment variable ATEXTCRAWLER_POSTGRESQL.

    The value is a space-separated list of `name=value` items; values
    may be unquoted, double-quoted or single-quoted. 'host', 'port' and
    'schema_name' default to 'localhost', 5432 and 'public'.

    Return an error or the PostgreSQL config (which can be None if
    the environment variable is not defined or empty).
    """
    env_var = 'ATEXTCRAWLER_POSTGRESQL'
    value = os.environ.get(env_var, '').strip()
    if not value:
        return None
    param_names = (
        'host',
        'port',
        'database',
        'user',
        'password',
        'schema_name',
    )
    re_dsn = re.compile(
        '((' + '|'.join(param_names) + ')'
        '=("(((?=[^"\\\\]).|\\\\.)*)"'  # value in double quotes
        '|\'(((?=[^\'\\\\]).|\\\\.)*)\''  # value in single quotes
        '|([^"\' ]*)'  # value unquoted
        ')( |$))+?'
    )
    params = {}
    # v1: double-quoted content, v2: single-quoted content, v3: unquoted
    for _, varname, _, v1, _, v2, _, v3, _ in re_dsn.findall(value):
        params[varname] = (
            v3
            or (v1 or '').replace('\\"', '"')
            or (v2 or '').replace("\\'", "'")
        )
    params.setdefault('host', 'localhost')
    params.setdefault('port', '5432')
    params.setdefault('schema_name', 'public')
    for name in param_names:
        if name not in params:
            return ConfigError(
                f'Missing {name} in environment variable {env_var}'
            )
    # A non-numeric port is a configuration error, not a crash.
    port = params['port']
    try:
        params['port'] = int(port)
    except ValueError:
        return ConfigError(
            f'Invalid port "{port}" in environment variable {env_var}'
        )
    return params


def _validate_config(config: Any) -> Union[ConfigError, dict]:
    """
    Validate the given configuration and fill in default values.

    If invalid, return only the first error (as :class:`ConfigError`).
    Otherwise return the configuration with added default values.
    """
    try:
        return schema_main(config)
    except Exception as err:
        return ConfigError(err)


def plugins_dir(config):
    """
    Validate plugins directory (absolute or relative path).

    If it is a relative path, prepend the config_dir and store the
    result back into `config['plugins_dir']`.

    Raise :class:`voluptuous.Invalid` if the path cannot be built or
    the directory does not contain an `__init__.py`.

    Return the (possibly modified) *config*.
    """
    config_dir = config['config_dir']
    configured = config['plugins_dir']
    try:
        if configured.startswith('/'):
            path = Path(configured)
        else:
            path = Path(config_dir) / Path(configured)
            config['plugins_dir'] = str(path)
    except Exception as err:
        raise Invalid(
            f'Invalid plugins_dir "{configured}" not found'
        ) from err
    if not (path / '__init__.py').exists():
        raise Invalid(f'plugins_dir "{path}" has no "__init__.py"')
    return config


def postgresql_identifier(value):
    """
    Validate a PostgreSQL identifier.

    Accepted are strings of 1 to 31 characters consisting of lowercase
    ASCII letters, digits and underscores, beginning with a letter.

    Return the unchanged *value* or raise :class:`voluptuous.Invalid`.
    """
    # fullmatch: unlike '$' with re.match it does not accept a trailing
    # newline after the identifier.
    if not isinstance(value, str) or not re.fullmatch(
        '[a-z][a-z0-9_]{0,30}', value
    ):
        # Braces are doubled so that the f-string prints them literally.
        raise Invalid(
            f'Invalid PostgreSQL identifier "{value}", '
            f'pattern must be: [a-z][a-z0-9_]{{0,30}}'
        )
    return value


def positive_number(value):
    """
    Validate a positive number (int or float).

    Return the unchanged *value* or raise :class:`voluptuous.Invalid`.
    """
    # NOTE(review): bool is a subclass of int, so True passes this check.
    if isinstance(value, (int, float)) and value > 0:
        return value
    raise Invalid('Not a positive number')


schema_postgresql = Schema(
    {
        Required('host'): All(str, Length(min=1)),
        Required('port', default=5432): All(int, Range(min=0, max=65535)),
        Required('database'): All(str, Length(min=1)),
        Required('user'): All(str, Length(min=1)),
        Required('password'): str,
        Required('schema_name', default='public'): postgresql_identifier,
    }
)


schema_crawl = Schema(
    {
        Required('workers', default=10): All(int, Range(min=0, max=1000)),
        Required('site_delay', default=600): positive_number,
        Required('site_revisit_interval', default=3600): positive_number,
        Required('resource_delay', default=5): positive_number,
        Required('full_crawl_interval', default=864000): positive_number,
        Required('feed_crawl_interval', default=86400): positive_number,
        Required('min_text_length', default=300): positive_number,
    }
)


schema_elasticsearch = Schema(
    {
        Required('host'): All(str, Length(min=1)),
        Required('api_key'): All(str, Length(min=1)),
        Required('id'): All(str, Length(min=1)),
        Required('index_base_name'): All(str, Length(min=1)),
    }
)


schema_tensorflow = Schema(
    {
        Required('model_server_endpoint'): Url(),
    }
)


# The dict schema is applied first (filling in defaults); plugins_dir then
# receives the whole resulting dict and resolves / checks the plugins path.
schema_main = Schema(
    All(
        {
            Required('config_dir'): All(str, Length(min=1)),
            Required(
                'instance_name', default='atextcrawler'
            ): postgresql_identifier,
            Required('instance_type', default='prod'): VAny(
                'dev',
                'staging',
                'prod',
            ),
            Required('log_level', default='info'): VAny(
                'critical',
                'error',
                'warning',
                'info',
                'debug',
            ),
            Required('plugins_dir', default='plugins'): All(
                str, Length(min=1)
            ),
            Required('postgresql'): schema_postgresql,
            Required('crawl'): schema_crawl,
            Required('elasticsearch'): schema_elasticsearch,
            Required('tensorflow'): schema_tensorflow,
        },
        plugins_dir,
    )
)


if __name__ == '__main__':
    from pprint import pprint

    pprint(Config.get())