From 3e6a69ce7f109f0fd4352507ad60d58d4cbd24a7 Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Fri, 25 Nov 2022 14:43:12 +0100 Subject: builders and pipeline --- bsie/tools/__init__.py | 20 +++++ bsie/tools/builder.py | 217 +++++++++++++++++++++++++++++++++++++++++++++++++ bsie/tools/pipeline.py | 121 +++++++++++++++++++++++++++ 3 files changed, 358 insertions(+) create mode 100644 bsie/tools/__init__.py create mode 100644 bsie/tools/builder.py create mode 100644 bsie/tools/pipeline.py (limited to 'bsie/tools') diff --git a/bsie/tools/__init__.py b/bsie/tools/__init__.py new file mode 100644 index 0000000..8ca9620 --- /dev/null +++ b/bsie/tools/__init__.py @@ -0,0 +1,20 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# inner-module imports +from . import builder +from . import pipeline + +# exports +__all__: typing.Sequence[str] = ( + 'builder', + 'pipeline', + ) + +## EOF ## diff --git a/bsie/tools/builder.py b/bsie/tools/builder.py new file mode 100644 index 0000000..8f7a410 --- /dev/null +++ b/bsie/tools/builder.py @@ -0,0 +1,217 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import importlib +import logging +import typing + +# bsie imports +from bsie import base +from bsie.base import errors +from bsie.utils.bsfs import URI, typename + +# inner-module imports +from . import pipeline + +# exports +__all__: typing.Sequence[str] = ( + 'ExtractorBuilder', + 'PipelineBuilder', + 'ReaderBuilder', + ) + + +## code ## + +logger = logging.getLogger(__name__) + +def _safe_load(module_name: str, class_name: str): + """Get a class from a module. Raise BuilderError if anything goes wrong.""" + try: + # load the module + module = importlib.import_module(module_name) + except Exception as err: + # cannot import module + raise errors.LoaderError(f'cannot load module {module_name}') from err + + try: + # get the class from the module + cls = getattr(module, class_name) + except Exception as err: + # cannot find the class + raise errors.LoaderError(f'cannot load class {class_name} from module {module_name}') from err + + return cls + + +def _unpack_name(name): + """Split a name into its module and class component (dot-separated).""" + if not isinstance(name, str): + raise TypeError(name) + if '.' not in name: + raise ValueError('name must be a qualified class name.') + module_name, class_name = name[:name.rfind('.')], name[name.rfind('.')+1:] + if module_name == '': + raise ValueError('name must be a qualified class name.') + return module_name, class_name + + +class ReaderBuilder(): + """Build `bsie.base.reader.Reader` instances. + + Readers are defined via their qualified class name + (e.g., bsie.reader.path.Path) and optional keyword + arguments that are passed to the constructor via + the *kwargs* argument (name as key, kwargs as value). + The ReaderBuilder keeps a cache of previously built + reader instances, as they are anyway built with + identical keyword arguments. + + """ + + # keyword arguments + kwargs: typing.Dict[str, typing.Dict[str, typing.Any]] + + # cached readers + cache: typing.Dict[str, base.reader.Reader] + + def __init__(self, kwargs: typing.Dict[str, typing.Dict[str, typing.Any]]): + self.kwargs = kwargs + self.cache = {} + + def build(self, name: str) -> base.reader.Reader: + """Return an instance for the qualified class name.""" + # return cached instance + if name in self.cache: + return self.cache[name] + + # check name and get module/class components + module_name, class_name = _unpack_name(name) + + # import reader class + cls = _safe_load(module_name, class_name) + + # get kwargs + kwargs = self.kwargs.get(name, {}) + if not isinstance(kwargs, dict): + raise TypeError(f'expected a kwargs dict, found {typename(kwargs)}') + + try: # build, cache, and return instance + obj = cls(**kwargs) + # cache instance + self.cache[name] = obj + # return instance + return obj + + except Exception as err: + raise errors.BuilderError(f'failed to build reader {name} due to {typename(err)}: {err}') from err + + +class ExtractorBuilder(): + """Build `bsie.base.extractor.Extractor instances. + + It is permissible to build multiple instances of the same extractor + (typically with different arguments), hence the ExtractorBuilder + receives a list of build specifications. Each specification is + a dict with a single key (extractor's qualified name) and a dict + to be used as keyword arguments. + Example: [{'bsie.extractor.generic.path.Path': {}}, ] + + """ + + # build specifications + specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]] + + def __init__(self, specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]): + self.specs = specs + + def __iter__(self) -> typing.Iterator[int]: + """Iterate over extractor specifications.""" + return iter(range(len(self.specs))) + + def build(self, index: int) -> base.extractor.Extractor: + """Return an instance of the n'th extractor (n=*index*).""" + # get build instructions + specs = self.specs[index] + + # check specs structure. expecting[{name: {kwargs}}] + if not isinstance(specs, dict): + raise TypeError(f'expected a dict, found {typename(specs)}') + if len(specs) != 1: + raise TypeError(f'expected a dict of length one, found {len(specs)}') + + # get name and args from specs + name = next(iter(specs.keys())) + kwargs = specs[name] + + # check kwargs structure + if not isinstance(kwargs, dict): + raise TypeError(f'expected a dict, found {typename(kwargs)}') + + # check name and get module/class components + module_name, class_name = _unpack_name(name) + + # import extractor class + cls = _safe_load(module_name, class_name) + + try: # build and return instance + return cls(**kwargs) + + except Exception as err: + raise errors.BuilderError(f'failed to build extractor {name} due to {typename(err)}: {err}') from err + + +class PipelineBuilder(): + """Build `bsie.tools.pipeline.Pipeline` instances.""" + + def __init__( + self, + prefix: URI, + reader_builder: ReaderBuilder, + extractor_builder: ExtractorBuilder, + ): + self.prefix = prefix + self.rbuild = reader_builder + self.ebuild = extractor_builder + + def build(self) -> pipeline.Pipeline: + """Return a Pipeline instance.""" + ext2rdr = {} + + for eidx in self.ebuild: + # build extractor + try: + ext = self.ebuild.build(eidx) + + except errors.LoaderError as err: # failed to load extractor; skip + logger.error('failed to load extractor: %s', err) + continue + + except errors.BuilderError as err: # failed to build instance; skip + logger.error(str(err)) + continue + + try: + # get reader required by extractor + if ext.CONTENT_READER is not None: + rdr = self.rbuild.build(ext.CONTENT_READER) + else: + rdr = None + # store extractor + ext2rdr[ext] = rdr + + except errors.LoaderError as err: # failed to load reader + logger.error('failed to load reader: %s', err) + + except errors.BuilderError as err: # failed to build reader + logger.error(str(err)) + + return pipeline.Pipeline(self.prefix, ext2rdr) + + + +## EOF ## diff --git a/bsie/tools/pipeline.py b/bsie/tools/pipeline.py new file mode 100644 index 0000000..8e1c992 --- /dev/null +++ b/bsie/tools/pipeline.py @@ -0,0 +1,121 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +from collections import defaultdict +import logging +import typing + +# bsie imports +from bsie import base +from bsie.utils import ns +from bsie.utils.node import Node +from bsie.utils.bsfs import schema as _schema, URI, uuid as _uuid, typename + +# exports +__all__: typing.Sequence[str] = ( + 'Pipeline', + ) + +## code ## + +logger = logging.getLogger(__name__) + +class Pipeline(): + """Extraction pipeline to generate triples from files. + + The Pipeline binds readers and extractors, and performs + the necessary operations to produce triples from a file. + It takes a best-effort approach to extract as many triples + as possible. Errors during the extraction are passed over + and reported to the log. + + """ + + # combined extractor schemas. + schema: _schema.Schema + + # node prefix. + _prefix: URI + + # extractor -> reader mapping + _ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]] + + def __init__( + self, + prefix: URI, + ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]] + ): + # store core members + self._prefix = prefix + self._ext2rdr = ext2rdr + # compile schema from all extractors + self.schema = _schema.Schema.Union(ext.schema for ext in ext2rdr) + + def __str__(self) -> str: + return typename(self) + + def __repr__(self) -> str: + return f'{typename(self)}(...)' + + def __hash__(self) -> int: + return hash((type(self), self._prefix, self.schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values()))) + + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) \ + and self.schema == other.schema \ + and self._prefix == other._prefix \ + and self._ext2rdr == other._ext2rdr + + def __call__( + self, + path: URI, + predicates: typing.Optional[typing.Iterable[_schema.Predicate]] = None, + ) -> typing.Iterator[typing.Tuple[Node, _schema.Predicate, typing.Any]]: + """Extract triples from the file at *path*. Optionally, limit triples to *predicates*.""" + # get predicates + predicates = set(predicates) if predicates is not None else set(self.schema.predicates()) + + # get extractors + extractors = {ext for ext in self._ext2rdr if not set(ext.predicates()).isdisjoint(predicates)} + + # corner-case short-cut + if len(extractors) == 0: + return + + # get readers -> extractors mapping + rdr2ext = defaultdict(set) + for ext in extractors: + rdr = self._ext2rdr[ext] + rdr2ext[rdr].add(ext) + + # create subject for file + uuid = _uuid.UCID.from_path(path) + subject = Node(ns.bsfs.Entity, self._prefix + uuid) + + # extract information + for rdr, extrs in rdr2ext.items(): + try: + # get content + content = rdr(path) if rdr is not None else None + + # apply extractors on this content + for ext in extrs: + try: + # get predicate/value tuples + for node, pred, value in ext.extract(subject, content, predicates): + yield node, pred, value + + except base.errors.ExtractorError as err: + # critical extractor failure. + logger.error('%s failed to extract triples from content: %s', ext, err) + + except base.errors.ReaderError as err: + # failed to read any content. skip. + logger.error('%s failed to read content: %s', rdr, err) + + +## EOF ## -- cgit v1.2.3 From 559e643bb1fa39feefd2eb73847ad9420daf1deb Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Wed, 14 Dec 2022 06:10:25 +0100 Subject: bsie extraction and info apps --- bsie/tools/pipeline.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'bsie/tools') diff --git a/bsie/tools/pipeline.py b/bsie/tools/pipeline.py index 8e1c992..da422c0 100644 --- a/bsie/tools/pipeline.py +++ b/bsie/tools/pipeline.py @@ -70,6 +70,10 @@ class Pipeline(): and self._prefix == other._prefix \ and self._ext2rdr == other._ext2rdr + def predicates(self) -> typing.Iterator[_schema.Predicate]: + """Return the predicates that are extracted from a file.""" + return iter({pred for ext in self._ext2rdr for pred in ext.predicates()}) + def __call__( self, path: URI, -- cgit v1.2.3 From 49cf03fc212c813862453de5352436dc90d1e458 Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Thu, 15 Dec 2022 16:50:53 +0100 Subject: imports and init files --- bsie/tools/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'bsie/tools') diff --git a/bsie/tools/__init__.py b/bsie/tools/__init__.py index 8ca9620..803c321 100644 --- a/bsie/tools/__init__.py +++ b/bsie/tools/__init__.py @@ -9,12 +9,12 @@ import typing # inner-module imports from . import builder -from . import pipeline +from .pipeline import Pipeline # exports __all__: typing.Sequence[str] = ( 'builder', - 'pipeline', + 'Pipeline', ) ## EOF ## -- cgit v1.2.3 From 3b7fee369924eb7704709edeb8c17fff9c020dfb Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Thu, 15 Dec 2022 17:06:09 +0100 Subject: import fixes --- bsie/tools/builder.py | 17 +++++++++-------- bsie/tools/pipeline.py | 6 +++--- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'bsie/tools') diff --git a/bsie/tools/builder.py b/bsie/tools/builder.py index 8f7a410..8c6b931 100644 --- a/bsie/tools/builder.py +++ b/bsie/tools/builder.py @@ -13,6 +13,7 @@ import typing from bsie import base from bsie.base import errors from bsie.utils.bsfs import URI, typename +from bsie.utils import bsfs # inner-module imports from . import pipeline @@ -61,7 +62,7 @@ def _unpack_name(name): class ReaderBuilder(): - """Build `bsie.base.reader.Reader` instances. + """Build `bsie.base.Reader` instances. Readers are defined via their qualified class name (e.g., bsie.reader.path.Path) and optional keyword @@ -83,7 +84,7 @@ class ReaderBuilder(): self.kwargs = kwargs self.cache = {} - def build(self, name: str) -> base.reader.Reader: + def build(self, name: str) -> base.Reader: """Return an instance for the qualified class name.""" # return cached instance if name in self.cache: @@ -98,7 +99,7 @@ class ReaderBuilder(): # get kwargs kwargs = self.kwargs.get(name, {}) if not isinstance(kwargs, dict): - raise TypeError(f'expected a kwargs dict, found {typename(kwargs)}') + raise TypeError(f'expected a kwargs dict, found {bsfs.typename(kwargs)}') try: # build, cache, and return instance obj = cls(**kwargs) @@ -108,11 +109,11 @@ class ReaderBuilder(): return obj except Exception as err: - raise errors.BuilderError(f'failed to build reader {name} due to {typename(err)}: {err}') from err + raise errors.BuilderError(f'failed to build reader {name} due to {bsfs.typename(err)}: {err}') from err class ExtractorBuilder(): - """Build `bsie.base.extractor.Extractor instances. + """Build `bsie.base.Extractor instances. It is permissible to build multiple instances of the same extractor (typically with different arguments), hence the ExtractorBuilder @@ -133,14 +134,14 @@ class ExtractorBuilder(): """Iterate over extractor specifications.""" return iter(range(len(self.specs))) - def build(self, index: int) -> base.extractor.Extractor: + def build(self, index: int) -> base.Extractor: """Return an instance of the n'th extractor (n=*index*).""" # get build instructions specs = self.specs[index] # check specs structure. expecting[{name: {kwargs}}] if not isinstance(specs, dict): - raise TypeError(f'expected a dict, found {typename(specs)}') + raise TypeError(f'expected a dict, found {bsfs.typename(specs)}') if len(specs) != 1: raise TypeError(f'expected a dict of length one, found {len(specs)}') @@ -150,7 +151,7 @@ class ExtractorBuilder(): # check kwargs structure if not isinstance(kwargs, dict): - raise TypeError(f'expected a dict, found {typename(kwargs)}') + raise TypeError(f'expected a dict, found {bsfs.typename(kwargs)}') # check name and get module/class components module_name, class_name = _unpack_name(name) diff --git a/bsie/tools/pipeline.py b/bsie/tools/pipeline.py index da422c0..7fdd935 100644 --- a/bsie/tools/pipeline.py +++ b/bsie/tools/pipeline.py @@ -11,9 +11,9 @@ import typing # bsie imports from bsie import base -from bsie.utils import ns from bsie.utils.node import Node from bsie.utils.bsfs import schema as _schema, URI, uuid as _uuid, typename +from bsie.utils import bsfs, node, ns # exports __all__: typing.Sequence[str] = ( @@ -56,10 +56,10 @@ class Pipeline(): self.schema = _schema.Schema.Union(ext.schema for ext in ext2rdr) def __str__(self) -> str: - return typename(self) + return bsfs.typename(self) def __repr__(self) -> str: - return f'{typename(self)}(...)' + return f'{bsfs.typename(self)}(...)' def __hash__(self) -> int: return hash((type(self), self._prefix, self.schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values()))) -- cgit v1.2.3 From 8e6d27ea75d2c8d68f6dd8b3d529aaa278f291cc Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Thu, 15 Dec 2022 17:12:56 +0100 Subject: file node class in default schema --- bsie/tools/pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'bsie/tools') diff --git a/bsie/tools/pipeline.py b/bsie/tools/pipeline.py index 7fdd935..3d08993 100644 --- a/bsie/tools/pipeline.py +++ b/bsie/tools/pipeline.py @@ -97,8 +97,8 @@ class Pipeline(): rdr2ext[rdr].add(ext) # create subject for file - uuid = _uuid.UCID.from_path(path) - subject = Node(ns.bsfs.Entity, self._prefix + uuid) + uuid = bsfs.uuid.UCID.from_path(path) + subject = node.Node(ns.bsfs.File, self._prefix + 'file#' + uuid) # extract information for rdr, extrs in rdr2ext.items(): -- cgit v1.2.3 From 5d9526783ad8432c7d6dfe18c0e9f2b37950b470 Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Thu, 15 Dec 2022 17:16:25 +0100 Subject: Pipeline.prefix as Namespace instead of URI --- bsie/tools/builder.py | 13 +++++++++++-- bsie/tools/pipeline.py | 4 ++-- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'bsie/tools') diff --git a/bsie/tools/builder.py b/bsie/tools/builder.py index 8c6b931..24aea84 100644 --- a/bsie/tools/builder.py +++ b/bsie/tools/builder.py @@ -163,15 +163,24 @@ class ExtractorBuilder(): return cls(**kwargs) except Exception as err: - raise errors.BuilderError(f'failed to build extractor {name} due to {typename(err)}: {err}') from err + raise errors.BuilderError(f'failed to build extractor {name} due to {bsfs.typename(err)}: {err}') from err class PipelineBuilder(): """Build `bsie.tools.pipeline.Pipeline` instances.""" + # Prefix to be used in the Pipeline. + prefix: bsfs.Namespace + + # builder for Readers. + rbuild: ReaderBuilder + + # builder for Extractors. + ebuild: ExtractorBuilder + def __init__( self, - prefix: URI, + prefix: bsfs.Namespace, reader_builder: ReaderBuilder, extractor_builder: ExtractorBuilder, ): diff --git a/bsie/tools/pipeline.py b/bsie/tools/pipeline.py index 3d08993..834bd99 100644 --- a/bsie/tools/pipeline.py +++ b/bsie/tools/pipeline.py @@ -39,14 +39,14 @@ class Pipeline(): schema: _schema.Schema # node prefix. - _prefix: URI + _prefix: bsfs.Namespace # extractor -> reader mapping _ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]] def __init__( self, - prefix: URI, + prefix: bsfs.Namespace, ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]] ): # store core members -- cgit v1.2.3 From 37510d134458bf954ca2da6d40be0d6c76661e8c Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Thu, 15 Dec 2022 17:19:21 +0100 Subject: bsie/pipeline interface revision: * predicates -> principals * schema as property * principals as property * information hiding * full subschema instead of only predicates --- bsie/tools/pipeline.py | 52 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 18 deletions(-) (limited to 'bsie/tools') diff --git a/bsie/tools/pipeline.py b/bsie/tools/pipeline.py index 834bd99..52ce526 100644 --- a/bsie/tools/pipeline.py +++ b/bsie/tools/pipeline.py @@ -11,8 +11,6 @@ import typing # bsie imports from bsie import base -from bsie.utils.node import Node -from bsie.utils.bsfs import schema as _schema, URI, uuid as _uuid, typename from bsie.utils import bsfs, node, ns # exports @@ -36,7 +34,7 @@ class Pipeline(): """ # combined extractor schemas. - schema: _schema.Schema + _schema: bsfs.schema.Schema # node prefix. _prefix: bsfs.Namespace @@ -53,7 +51,7 @@ class Pipeline(): self._prefix = prefix self._ext2rdr = ext2rdr # compile schema from all extractors - self.schema = _schema.Schema.Union(ext.schema for ext in ext2rdr) + self._schema = bsfs.schema.Schema.Union(ext.schema for ext in ext2rdr) def __str__(self) -> str: return bsfs.typename(self) @@ -62,29 +60,47 @@ class Pipeline(): return f'{bsfs.typename(self)}(...)' def __hash__(self) -> int: - return hash((type(self), self._prefix, self.schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values()))) + return hash((type(self), self._prefix, self._schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values()))) def __eq__(self, other: typing.Any) -> bool: return isinstance(other, type(self)) \ - and self.schema == other.schema \ + and self._schema == other._schema \ and self._prefix == other._prefix \ and self._ext2rdr == other._ext2rdr - def predicates(self) -> typing.Iterator[_schema.Predicate]: - """Return the predicates that are extracted from a file.""" - return iter({pred for ext in self._ext2rdr for pred in ext.predicates()}) + @property + def schema(self) -> bsfs.schema.Schema: + """Return the pipeline's schema (combined from all extractors).""" + return self._schema + + @property + def principals(self) -> typing.Iterator[bsfs.schema.Predicate]: + """Return the principal predicates that can be extracted.""" + return iter({pred for ext in self._ext2rdr for pred in ext.principals}) + + def subschema(self, principals: typing.Iterable[bsfs.schema.Predicate]) -> bsfs.schema.Schema: + """Return the subset of the schema that supports the given *principals*.""" + # materialize principals + principals = set(principals) + # collect and combine schemas from extractors + return bsfs.schema.Schema.Union({ + ext.schema + for ext + in self._ext2rdr + if not set(ext.principals).isdisjoint(principals) + }) def __call__( self, - path: URI, - predicates: typing.Optional[typing.Iterable[_schema.Predicate]] = None, - ) -> typing.Iterator[typing.Tuple[Node, _schema.Predicate, typing.Any]]: - """Extract triples from the file at *path*. Optionally, limit triples to *predicates*.""" - # get predicates - predicates = set(predicates) if predicates is not None else set(self.schema.predicates()) + path: bsfs.URI, + principals: typing.Optional[typing.Iterable[bsfs.schema.Predicate]] = None, + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + """Extract triples from the file at *path*. Optionally, limit triples to *principals*.""" + # get principals + principals = set(principals) if principals is not None else set(self.schema.predicates()) # get extractors - extractors = {ext for ext in self._ext2rdr if not set(ext.predicates()).isdisjoint(predicates)} + extractors = {ext for ext in self._ext2rdr if not set(ext.principals).isdisjoint(principals)} # corner-case short-cut if len(extractors) == 0: @@ -110,8 +126,8 @@ class Pipeline(): for ext in extrs: try: # get predicate/value tuples - for node, pred, value in ext.extract(subject, content, predicates): - yield node, pred, value + for subject, pred, value in ext.extract(subject, content, principals): + yield subject, pred, value except base.errors.ExtractorError as err: # critical extractor failure. -- cgit v1.2.3 From 3b41b2a4b7532c911b63b41066a75b3e1546d214 Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Thu, 15 Dec 2022 17:21:20 +0100 Subject: minor test improvements and information hiding in builder --- bsie/tools/builder.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'bsie/tools') diff --git a/bsie/tools/builder.py b/bsie/tools/builder.py index 24aea84..190d9bf 100644 --- a/bsie/tools/builder.py +++ b/bsie/tools/builder.py @@ -12,7 +12,6 @@ import typing # bsie imports from bsie import base from bsie.base import errors -from bsie.utils.bsfs import URI, typename from bsie.utils import bsfs # inner-module imports @@ -75,20 +74,20 @@ class ReaderBuilder(): """ # keyword arguments - kwargs: typing.Dict[str, typing.Dict[str, typing.Any]] + _kwargs: typing.Dict[str, typing.Dict[str, typing.Any]] # cached readers - cache: typing.Dict[str, base.reader.Reader] + _cache: typing.Dict[str, base.Reader] def __init__(self, kwargs: typing.Dict[str, typing.Dict[str, typing.Any]]): - self.kwargs = kwargs - self.cache = {} + self._kwargs = kwargs + self._cache = {} def build(self, name: str) -> base.Reader: """Return an instance for the qualified class name.""" # return cached instance - if name in self.cache: - return self.cache[name] + if name in self._cache: + return self._cache[name] # check name and get module/class components module_name, class_name = _unpack_name(name) @@ -97,14 +96,14 @@ class ReaderBuilder(): cls = _safe_load(module_name, class_name) # get kwargs - kwargs = self.kwargs.get(name, {}) + kwargs = self._kwargs.get(name, {}) if not isinstance(kwargs, dict): raise TypeError(f'expected a kwargs dict, found {bsfs.typename(kwargs)}') try: # build, cache, and return instance obj = cls(**kwargs) # cache instance - self.cache[name] = obj + self._cache[name] = obj # return instance return obj @@ -125,19 +124,19 @@ class ExtractorBuilder(): """ # build specifications - specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]] + _specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]] def __init__(self, specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]): - self.specs = specs + self._specs = specs def __iter__(self) -> typing.Iterator[int]: """Iterate over extractor specifications.""" - return iter(range(len(self.specs))) + return iter(range(len(self._specs))) def build(self, index: int) -> base.Extractor: """Return an instance of the n'th extractor (n=*index*).""" # get build instructions - specs = self.specs[index] + specs = self._specs[index] # check specs structure. expecting[{name: {kwargs}}] if not isinstance(specs, dict): -- cgit v1.2.3 From 057e09d6537bf5c39815661a75819081e3e5fda7 Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Sun, 18 Dec 2022 13:37:59 +0100 Subject: adaptions to updates in bsfs --- bsie/tools/pipeline.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'bsie/tools') diff --git a/bsie/tools/pipeline.py b/bsie/tools/pipeline.py index 52ce526..20e8ddf 100644 --- a/bsie/tools/pipeline.py +++ b/bsie/tools/pipeline.py @@ -18,6 +18,9 @@ __all__: typing.Sequence[str] = ( 'Pipeline', ) +# constants +FILE_PREFIX = 'file#' + ## code ## logger = logging.getLogger(__name__) @@ -48,7 +51,7 @@ class Pipeline(): ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]] ): # store core members - self._prefix = prefix + self._prefix = prefix + FILE_PREFIX self._ext2rdr = ext2rdr # compile schema from all extractors self._schema = bsfs.schema.Schema.Union(ext.schema for ext in ext2rdr) @@ -114,7 +117,7 @@ class Pipeline(): # create subject for file uuid = bsfs.uuid.UCID.from_path(path) - subject = node.Node(ns.bsfs.File, self._prefix + 'file#' + uuid) + subject = node.Node(ns.bsfs.File, self._prefix[uuid]) # extract information for rdr, extrs in rdr2ext.items(): -- cgit v1.2.3