aboutsummaryrefslogtreecommitdiffstats
path: root/bsie/tools/builder.py
diff options
context:
space:
mode:
author Matthias Baumgartner <dev@igsor.net> 2022-11-25 14:43:12 +0100
committer Matthias Baumgartner <dev@igsor.net> 2022-11-25 14:43:12 +0100
commit 3e6a69ce7f109f0fd4352507ad60d58d4cbd24a7 (patch)
tree f038ed8d4f04c63991939e13e61ae170de4e2c57 /bsie/tools/builder.py
parent c9a1dea230054f5d6f40b7fd5e3930609c5f6416 (diff)
download bsie-3e6a69ce7f109f0fd4352507ad60d58d4cbd24a7.tar.gz
bsie-3e6a69ce7f109f0fd4352507ad60d58d4cbd24a7.tar.bz2
bsie-3e6a69ce7f109f0fd4352507ad60d58d4cbd24a7.zip
builders and pipeline
Diffstat (limited to 'bsie/tools/builder.py')
-rw-r--r-- bsie/tools/builder.py 217
1 files changed, 217 insertions, 0 deletions
diff --git a/bsie/tools/builder.py b/bsie/tools/builder.py
new file mode 100644
index 0000000..8f7a410
--- /dev/null
+++ b/bsie/tools/builder.py
@@ -0,0 +1,217 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import importlib
+import logging
+import typing
+
+# bsie imports
+from bsie import base
+from bsie.base import errors
+from bsie.utils.bsfs import URI, typename
+
+# inner-module imports
+from . import pipeline
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'ExtractorBuilder',
+ 'PipelineBuilder',
+ 'ReaderBuilder',
+ )
+
+
+## code ##
+
+logger = logging.getLogger(__name__)
+
def _safe_load(module_name: str, class_name: str):
    """Return the attribute *class_name* of the module *module_name*.

    The module is imported via `importlib.import_module` and the class
    is then looked up on it with `getattr`.

    Raises `errors.LoaderError` if the module cannot be imported or if
    the lookup of *class_name* on the module fails. The original
    exception is attached as the cause of the LoaderError.
    """
    try:
        # load the module
        module = importlib.import_module(module_name)
    except Exception as err:
        # importing runs the module's code, hence any exception type may surface here
        raise errors.LoaderError(f'cannot load module {module_name}') from err

    try:
        # get the class from the module
        return getattr(module, class_name)
    except Exception as err:
        # cannot find the class
        raise errors.LoaderError(f'cannot load class {class_name} from module {module_name}') from err
+
+
def _unpack_name(name):
    """Separate a qualified class name into (module name, class name).

    The name is split at its last dot. A TypeError is raised if *name* is
    not a string; a ValueError is raised if it contains no dot or if
    nothing precedes the last dot.
    """
    if not isinstance(name, str):
        raise TypeError(name)
    # everything before the last dot is the module, the remainder is the class
    module_name, separator, class_name = name.rpartition('.')
    if separator == '' or module_name == '':
        raise ValueError('name must be a qualified class name.')
    return module_name, class_name
+
+
class ReaderBuilder():
    """Factory for `bsie.base.reader.Reader` instances.

    A reader is requested by its qualified class name (e.g.,
    bsie.reader.path.Path). Its constructor keyword arguments are
    looked up in *kwargs*, a dict with qualified class names as keys
    and kwargs dicts as values; a name without an entry is built
    without arguments.
    Every reader is built at most once: the instance is cached under
    its name and handed out again on later requests, since it would be
    built with identical keyword arguments anyway.

    """

    # constructor keyword arguments, by qualified class name
    kwargs: typing.Dict[str, typing.Dict[str, typing.Any]]

    # readers built so far, by qualified class name
    cache: typing.Dict[str, base.reader.Reader]

    def __init__(self, kwargs: typing.Dict[str, typing.Dict[str, typing.Any]]):
        self.cache = {}
        self.kwargs = kwargs

    def build(self, name: str) -> base.reader.Reader:
        """Return the reader instance for the qualified class name *name*.

        Raises a TypeError if the configured kwargs are not a dict and an
        `errors.BuilderError` if the reader's constructor fails.
        """
        # serve previously built instances from the cache
        try:
            return self.cache[name]
        except KeyError:
            pass

        # validate the name, then import the reader class
        module_name, class_name = _unpack_name(name)
        reader_cls = _safe_load(module_name, class_name)

        # fetch the constructor arguments
        reader_kwargs = self.kwargs.get(name, {})
        if not isinstance(reader_kwargs, dict):
            raise TypeError(f'expected a kwargs dict, found {typename(reader_kwargs)}')

        try:
            # build the instance and remember it for later requests
            reader = self.cache[name] = reader_cls(**reader_kwargs)
        except Exception as err:
            raise errors.BuilderError(f'failed to build reader {name} due to {typename(err)}: {err}') from err
        return reader
+
+
class ExtractorBuilder():
    """Factory for `bsie.base.extractor.Extractor` instances.

    The same extractor class may be built several times (typically
    with different arguments), which is why the builder is configured
    with a list of build specifications. Every specification is a dict
    with exactly one item: the extractor's qualified class name as key
    and a dict of constructor keyword arguments as value.
    Example: [{'bsie.extractor.generic.path.Path': {}}, ]

    """

    # build specifications
    specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]

    def __init__(self, specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]):
        self.specs = specs

    def __iter__(self) -> typing.Iterator[int]:
        """Iterate over the indices of the extractor specifications."""
        indices = range(len(self.specs))
        return iter(indices)

    def build(self, index: int) -> base.extractor.Extractor:
        """Return an instance of the extractor specified at position *index*.

        Raises a TypeError if the specification is not of the form
        {name: {kwargs}} and an `errors.BuilderError` if the extractor's
        constructor fails.
        """
        spec = self.specs[index]

        # the specification must be a dict with a single item
        if not isinstance(spec, dict):
            raise TypeError(f'expected a dict, found {typename(spec)}')
        if len(spec) != 1:
            raise TypeError(f'expected a dict of length one, found {len(spec)}')

        # the single item holds the qualified name and the keyword arguments
        ((name, kwargs),) = spec.items()
        if not isinstance(kwargs, dict):
            raise TypeError(f'expected a dict, found {typename(kwargs)}')

        # validate the name, then import the extractor class
        extractor_cls = _safe_load(*_unpack_name(name))

        try:
            return extractor_cls(**kwargs)
        except Exception as err:
            raise errors.BuilderError(f'failed to build extractor {name} due to {typename(err)}: {err}') from err
+
+
class PipelineBuilder():
    """Assemble `bsie.tools.pipeline.Pipeline` instances.

    Extractors are obtained from an ExtractorBuilder and the readers
    they require from a ReaderBuilder.
    """

    def __init__(
            self,
            prefix: URI,
            reader_builder: ReaderBuilder,
            extractor_builder: ExtractorBuilder,
            ):
        self.prefix = prefix
        self.rbuild = reader_builder
        self.ebuild = extractor_builder

    def build(self) -> pipeline.Pipeline:
        """Return a Pipeline over all extractors that could be built.

        An extractor is logged and left out of the pipeline if building
        it, or the reader it requires, fails with a LoaderError or a
        BuilderError.
        """
        readers_by_extractor = {}

        for index in self.ebuild:
            # build the extractor; on failure, report and move on to the next one
            try:
                extractor = self.ebuild.build(index)
            except errors.LoaderError as err:
                logger.error('failed to load extractor: %s', err)
                continue
            except errors.BuilderError as err:
                logger.error(str(err))
                continue

            # build the reader the extractor asks for (if any), then register the pair
            try:
                reader = (
                    None if extractor.CONTENT_READER is None
                    else self.rbuild.build(extractor.CONTENT_READER)
                )
                readers_by_extractor[extractor] = reader
            except errors.LoaderError as err:
                logger.error('failed to load reader: %s', err)
            except errors.BuilderError as err:
                logger.error(str(err))

        return pipeline.Pipeline(self.prefix, readers_by_extractor)
+
+
+
+## EOF ##