From 266c2c9a072bf3289fd7f2d75278b7d59528378c Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Sat, 24 Dec 2022 10:27:09 +0100 Subject: package restructuring: base * Reader and Extractor to respective reader/extractor modules * ReaderBuilder to reader module * ExtractorBuilder to extractor module * Loading module in utils (safe_load, unpack_name) * Pipeline and PipelineBuilder to lib module * errors to utils * documentation: "standard import" and "external import" --- bsie/apps/index.py | 16 +-- bsie/apps/info.py | 16 +-- bsie/base/__init__.py | 24 ---- bsie/base/errors.py | 45 -------- bsie/base/extractor.py | 103 ----------------- bsie/base/reader.py | 47 -------- bsie/extractor/__init__.py | 11 +- bsie/extractor/base.py | 103 +++++++++++++++++ bsie/extractor/builder.py | 77 +++++++++++++ bsie/extractor/generic/constant.py | 10 +- bsie/extractor/generic/path.py | 8 +- bsie/extractor/generic/stat.py | 10 +- bsie/lib/__init__.py | 4 +- bsie/lib/bsie.py | 6 +- bsie/lib/builder.py | 85 ++++++++++++++ bsie/lib/pipeline.py | 145 ++++++++++++++++++++++++ bsie/reader/__init__.py | 13 +++ bsie/reader/base.py | 47 ++++++++ bsie/reader/builder.py | 74 ++++++++++++ bsie/reader/path.py | 8 +- bsie/reader/stat.py | 9 +- bsie/tools/__init__.py | 20 ---- bsie/tools/builder.py | 226 ------------------------------------- bsie/tools/pipeline.py | 144 ----------------------- bsie/utils/__init__.py | 9 +- bsie/utils/errors.py | 45 ++++++++ bsie/utils/filematcher/parser.py | 6 +- bsie/utils/loading.py | 54 +++++++++ 28 files changed, 710 insertions(+), 655 deletions(-) delete mode 100644 bsie/base/__init__.py delete mode 100644 bsie/base/errors.py delete mode 100644 bsie/base/extractor.py delete mode 100644 bsie/base/reader.py create mode 100644 bsie/extractor/base.py create mode 100644 bsie/extractor/builder.py create mode 100644 bsie/lib/builder.py create mode 100644 bsie/lib/pipeline.py create mode 100644 bsie/reader/base.py create mode 100644 bsie/reader/builder.py delete mode 100644 bsie/tools/__init__.py delete mode 100644 bsie/tools/builder.py delete mode 100644 bsie/tools/pipeline.py create mode 100644 bsie/utils/errors.py create mode 100644 bsie/utils/loading.py (limited to 'bsie') diff --git a/bsie/apps/index.py b/bsie/apps/index.py index 1dbfdd8..0c6296f 100644 --- a/bsie/apps/index.py +++ b/bsie/apps/index.py @@ -4,16 +4,16 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import argparse import os import typing # bsie imports -from bsie.base import errors -from bsie.lib import BSIE -from bsie.tools import builder -from bsie.utils import bsfs +from bsie.extractor import ExtractorBuilder +from bsie.lib import BSIE, PipelineBuilder +from bsie.reader import ReaderBuilder +from bsie.utils import bsfs, errors # exports __all__: typing.Sequence[str] = ( @@ -44,9 +44,9 @@ def main(argv): # FIXME: Read reader/extractor configs from a config file # reader builder - rbuild = builder.ReaderBuilder({}) + rbuild = ReaderBuilder({}) # extractor builder - ebuild = builder.ExtractorBuilder([ + ebuild = ExtractorBuilder([ {'bsie.extractor.generic.path.Path': {}}, {'bsie.extractor.generic.stat.Stat': {}}, {'bsie.extractor.generic.constant.Constant': dict( @@ -60,7 +60,7 @@ def main(argv): )}, ]) # pipeline builder - pbuild = builder.PipelineBuilder( + pbuild = PipelineBuilder( bsfs.Namespace(args.user + ('/' if not args.user.endswith('/') else '')), rbuild, ebuild, diff --git a/bsie/apps/info.py b/bsie/apps/info.py index eaf1f71..a4e611c 100644 --- a/bsie/apps/info.py +++ b/bsie/apps/info.py @@ -4,15 +4,16 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import argparse import sys import typing # bsie imports -from bsie.base import errors -from bsie.tools import builder -from bsie.utils import bsfs +from bsie.extractor import ExtractorBuilder +from bsie.lib import PipelineBuilder +from bsie.reader import ReaderBuilder +from bsie.utils import bsfs, errors # exports __all__: typing.Sequence[str] = ( @@ -31,9 +32,10 @@ def main(argv): # FIXME: Read reader/extractor configs from a config file # reader builder - rbuild = builder.ReaderBuilder({}) + rbuild = ReaderBuilder({ + }) # extractor builder - ebuild = builder.ExtractorBuilder([ + ebuild = ExtractorBuilder([ {'bsie.extractor.generic.path.Path': {}}, {'bsie.extractor.generic.stat.Stat': {}}, {'bsie.extractor.generic.constant.Constant': dict( @@ -47,7 +49,7 @@ def main(argv): )}, ]) # pipeline builder - pbuild = builder.PipelineBuilder( + pbuild = PipelineBuilder( bsfs.Namespace('http://example.com/me/'), # not actually used rbuild, ebuild, diff --git a/bsie/base/__init__.py b/bsie/base/__init__.py deleted file mode 100644 index 0d362cd..0000000 --- a/bsie/base/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -"""The base module defines the BSIE interfaces. - -You'll mostly find abstract classes here. - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports -import typing - -# inner-module imports -from . import errors -from .extractor import Extractor -from .reader import Reader - -# exports -__all__: typing.Sequence[str] = ( - 'Extractor', - 'Reader', - 'errors', - ) - -## EOF ## diff --git a/bsie/base/errors.py b/bsie/base/errors.py deleted file mode 100644 index 5fafd5b..0000000 --- a/bsie/base/errors.py +++ /dev/null @@ -1,45 +0,0 @@ -"""Common BSIE exceptions. - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports -import typing - -# exports -__all__: typing.Sequence[str] = ( - 'BuilderError', - 'ExtractorError', - 'LoaderError', - 'ReaderError', - ) - - -## code ## - -class _BSIEError(Exception): - """Generic BSIE error.""" - -class BuilderError(_BSIEError): - """The Builder failed to create an instance.""" - -class LoaderError(BuilderError): - """Failed to load a module or class.""" - -class ExtractorError(_BSIEError): - """The Extractor failed to process the given content.""" - -class ReaderError(_BSIEError): - """The Reader failed to read the given file.""" - -class ProgrammingError(_BSIEError): - """An assertion-like error that indicates a code-base issue.""" - -class UnreachableError(ProgrammingError): - """Bravo, you've reached a point in code that should logically not be reachable.""" - -class ParserError(_BSIEError): - """Failed to parse due to invalid syntax or structures.""" - -## EOF ## diff --git a/bsie/base/extractor.py b/bsie/base/extractor.py deleted file mode 100644 index c44021b..0000000 --- a/bsie/base/extractor.py +++ /dev/null @@ -1,103 +0,0 @@ -"""The Extractor classes transform content into triples. - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports -import abc -import typing - -# bsie imports -from bsie.utils import bsfs, node, ns - -# exports -__all__: typing.Sequence[str] = ( - 'Extractor', - ) - -# constants - -# essential definitions typically used in extractor schemas. -# NOTE: This preamble is only for convenience; Each Extractor must implement its use, if so desired. -SCHEMA_PREAMBLE = ''' - # common external prefixes - prefix rdf: - prefix rdfs: - prefix xsd: - prefix schema: - - # common bsfs prefixes - prefix bsfs: - prefix bse: - - # essential nodes - bsfs:Entity rdfs:subClassOf bsfs:Node . - bsfs:File rdfs:subClassOf bsfs:Entity . - - # common definitions - xsd:string rdfs:subClassOf bsfs:Literal . - xsd:integer rdfs:subClassOf bsfs:Literal . - - ''' - - -## code ## - -class Extractor(abc.ABC): - """Produce (subject, predicate, value)-triples from some content. - The Extractor produces princpal predicates that provide information - about the content itself (i.e., triples that include the subject), - and may also generate triples with auxiliary predicates if the - extracted value is a node itself. - """ - - # what type of content is expected (i.e. reader subclass). - CONTENT_READER: typing.Optional[str] = None - - # extractor schema. - _schema: bsfs.schema.Schema - - def __init__(self, schema: bsfs.schema.Schema): - self._schema = schema - - def __str__(self) -> str: - return bsfs.typename(self) - - def __repr__(self) -> str: - return f'{bsfs.typename(self)}()' - - def __eq__(self, other: typing.Any) -> bool: - return isinstance(other, type(self)) \ - and self.CONTENT_READER == other.CONTENT_READER \ - and self.schema == other.schema - - def __hash__(self) -> int: - return hash((type(self), self.CONTENT_READER, self.schema)) - - @property - def schema(self) -> bsfs.schema.Schema: - """Return the extractor's schema.""" - return self._schema - - @property - def principals(self) -> typing.Iterator[bsfs.schema.Predicate]: - """Return the principal predicates, i.e., relations from/to the extraction subject.""" - ent = self.schema.node(ns.bsfs.Entity) - return ( - pred - for pred - in self.schema.predicates() - if pred.domain <= ent or (pred.range is not None and pred.range <= ent) - ) - - @abc.abstractmethod - def extract( - self, - subject: node.Node, - content: typing.Any, - principals: typing.Iterable[bsfs.schema.Predicate], - ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: - """Return (node, predicate, value) triples.""" - -## EOF ## diff --git a/bsie/base/reader.py b/bsie/base/reader.py deleted file mode 100644 index cbabd36..0000000 --- a/bsie/base/reader.py +++ /dev/null @@ -1,47 +0,0 @@ -"""The Reader classes return high-level content structures from files. - -The Reader fulfills two purposes: - First, it brokers between multiple libraries and file formats. - Second, it separates multiple aspects of a file into distinct content types. - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports -import abc -import typing - -# bsie imports -from bsie.utils import bsfs - -# exports -__all__: typing.Sequence[str] = ( - 'Reader', - ) - - -## code ## - -class Reader(abc.ABC): - """Read and return some content from a file.""" - - def __str__(self) -> str: - return bsfs.typename(self) - - def __repr__(self) -> str: - return f'{bsfs.typename(self)}()' - - def __eq__(self, other: typing.Any) -> bool: - return isinstance(other, type(self)) - - def __hash__(self) -> int: - return hash(type(self)) - - @abc.abstractmethod - def __call__(self, path: bsfs.URI) -> typing.Any: - """Return some content of the file at *path*. - Raises a `ReaderError` if the reader cannot make sense of the file format. - """ - -## EOF ## diff --git a/bsie/extractor/__init__.py b/bsie/extractor/__init__.py index ef31343..5f385ee 100644 --- a/bsie/extractor/__init__.py +++ b/bsie/extractor/__init__.py @@ -6,10 +6,17 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing +# inner-module imports +from .base import Extractor +from .builder import ExtractorBuilder + # exports -__all__: typing.Sequence[str] = [] +__all__: typing.Sequence[str] = ( + 'Extractor', + 'ExtractorBuilder', + ) ## EOF ## diff --git a/bsie/extractor/base.py b/bsie/extractor/base.py new file mode 100644 index 0000000..c44021b --- /dev/null +++ b/bsie/extractor/base.py @@ -0,0 +1,103 @@ +"""The Extractor classes transform content into triples. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import abc +import typing + +# bsie imports +from bsie.utils import bsfs, node, ns + +# exports +__all__: typing.Sequence[str] = ( + 'Extractor', + ) + +# constants + +# essential definitions typically used in extractor schemas. +# NOTE: This preamble is only for convenience; Each Extractor must implement its use, if so desired. +SCHEMA_PREAMBLE = ''' + # common external prefixes + prefix rdf: + prefix rdfs: + prefix xsd: + prefix schema: + + # common bsfs prefixes + prefix bsfs: + prefix bse: + + # essential nodes + bsfs:Entity rdfs:subClassOf bsfs:Node . + bsfs:File rdfs:subClassOf bsfs:Entity . + + # common definitions + xsd:string rdfs:subClassOf bsfs:Literal . + xsd:integer rdfs:subClassOf bsfs:Literal . + + ''' + + +## code ## + +class Extractor(abc.ABC): + """Produce (subject, predicate, value)-triples from some content. + The Extractor produces princpal predicates that provide information + about the content itself (i.e., triples that include the subject), + and may also generate triples with auxiliary predicates if the + extracted value is a node itself. + """ + + # what type of content is expected (i.e. reader subclass). + CONTENT_READER: typing.Optional[str] = None + + # extractor schema. + _schema: bsfs.schema.Schema + + def __init__(self, schema: bsfs.schema.Schema): + self._schema = schema + + def __str__(self) -> str: + return bsfs.typename(self) + + def __repr__(self) -> str: + return f'{bsfs.typename(self)}()' + + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) \ + and self.CONTENT_READER == other.CONTENT_READER \ + and self.schema == other.schema + + def __hash__(self) -> int: + return hash((type(self), self.CONTENT_READER, self.schema)) + + @property + def schema(self) -> bsfs.schema.Schema: + """Return the extractor's schema.""" + return self._schema + + @property + def principals(self) -> typing.Iterator[bsfs.schema.Predicate]: + """Return the principal predicates, i.e., relations from/to the extraction subject.""" + ent = self.schema.node(ns.bsfs.Entity) + return ( + pred + for pred + in self.schema.predicates() + if pred.domain <= ent or (pred.range is not None and pred.range <= ent) + ) + + @abc.abstractmethod + def extract( + self, + subject: node.Node, + content: typing.Any, + principals: typing.Iterable[bsfs.schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + """Return (node, predicate, value) triples.""" + +## EOF ## diff --git a/bsie/extractor/builder.py b/bsie/extractor/builder.py new file mode 100644 index 0000000..0fd3685 --- /dev/null +++ b/bsie/extractor/builder.py @@ -0,0 +1,77 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# standard imports +import typing + +# bsie imports +from bsie.utils import bsfs, errors, safe_load, unpack_qualified_name + +# inner-module imports +from . import base + +# exports +__all__: typing.Sequence[str] = ( + 'ExtractorBuilder', + ) + + +## code ## + +class ExtractorBuilder(): + """Build `bsie.base.Extractor instances. + + It is permissible to build multiple instances of the same extractor + (typically with different arguments), hence the ExtractorBuilder + receives a list of build specifications. Each specification is + a dict with a single key (extractor's qualified name) and a dict + to be used as keyword arguments. + Example: [{'bsie.extractor.generic.path.Path': {}}, ] + + """ + + # build specifications + _specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]] + + def __init__(self, specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]): + self._specs = specs + + def __iter__(self) -> typing.Iterator[int]: + """Iterate over extractor specifications.""" + return iter(range(len(self._specs))) + + def build(self, index: int) -> base.Extractor: + """Return an instance of the n'th extractor (n=*index*).""" + # get build instructions + specs = self._specs[index] + + # check specs structure. expecting[{name: {kwargs}}] + if not isinstance(specs, dict): + raise TypeError(f'expected a dict, found {bsfs.typename(specs)}') + if len(specs) != 1: + raise TypeError(f'expected a dict of length one, found {len(specs)}') + + # get name and args from specs + name = next(iter(specs.keys())) + kwargs = specs[name] + + # check kwargs structure + if not isinstance(kwargs, dict): + raise TypeError(f'expected a dict, found {bsfs.typename(kwargs)}') + + # check name and get module/class components + module_name, class_name = unpack_qualified_name(name) + + # import extractor class + cls = safe_load(module_name, class_name) + + try: # build and return instance + return cls(**kwargs) + + except Exception as err: + raise errors.BuilderError(f'failed to build extractor {name} due to {bsfs.typename(err)}: {err}') from err + +## EOF ## diff --git a/bsie/extractor/generic/constant.py b/bsie/extractor/generic/constant.py index 11384e6..7b1d942 100644 --- a/bsie/extractor/generic/constant.py +++ b/bsie/extractor/generic/constant.py @@ -4,13 +4,15 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # bsie imports -from bsie.base import extractor from bsie.utils import bsfs, node +# inner-module imports +from .. import base + # exports __all__: typing.Sequence[str] = ( 'Constant', @@ -19,7 +21,7 @@ __all__: typing.Sequence[str] = ( ## code ## -class Constant(extractor.Extractor): +class Constant(base.Extractor): """Extract information from file's path.""" CONTENT_READER = None @@ -32,7 +34,7 @@ class Constant(extractor.Extractor): schema: str, tuples: typing.Iterable[typing.Tuple[bsfs.URI, typing.Any]], ): - super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + schema)) + super().__init__(bsfs.schema.Schema.from_string(base.SCHEMA_PREAMBLE + schema)) # NOTE: Raises a KeyError if the predicate is not part of the schema self._tuples = tuple((self.schema.predicate(p_uri), value) for p_uri, value in tuples) # TODO: use schema instance for value checking diff --git a/bsie/extractor/generic/path.py b/bsie/extractor/generic/path.py index 7018e12..295715f 100644 --- a/bsie/extractor/generic/path.py +++ b/bsie/extractor/generic/path.py @@ -4,12 +4,12 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import os import typing # bsie imports -from bsie.base import extractor +from bsie.extractor import base from bsie.utils import bsfs, node, ns # exports @@ -20,7 +20,7 @@ __all__: typing.Sequence[str] = ( ## code ## -class Path(extractor.Extractor): +class Path(base.Extractor): """Extract information from file's path.""" CONTENT_READER = 'bsie.reader.path.Path' @@ -29,7 +29,7 @@ class Path(extractor.Extractor): _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[str], typing.Any]] def __init__(self): - super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' + super().__init__(bsfs.schema.Schema.from_string(base.SCHEMA_PREAMBLE + ''' bse:filename rdfs:subClassOf bsfs:Predicate ; rdfs:domain bsfs:File ; rdfs:range xsd:string ; diff --git a/bsie/extractor/generic/stat.py b/bsie/extractor/generic/stat.py index 0b9ce29..1381fe2 100644 --- a/bsie/extractor/generic/stat.py +++ b/bsie/extractor/generic/stat.py @@ -4,14 +4,16 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import os import typing # bsie imports -from bsie.base import extractor from bsie.utils import bsfs, node, ns +# inner-module imports +from .. import base + # exports __all__: typing.Sequence[str] = ( 'Stat', @@ -20,7 +22,7 @@ __all__: typing.Sequence[str] = ( ## code ## -class Stat(extractor.Extractor): +class Stat(base.Extractor): """Extract information from the file system.""" CONTENT_READER = 'bsie.reader.stat.Stat' @@ -29,7 +31,7 @@ class Stat(extractor.Extractor): _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[os.stat_result], typing.Any]] def __init__(self): - super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' + super().__init__(bsfs.schema.Schema.from_string(base.SCHEMA_PREAMBLE + ''' bse:filesize rdfs:subClassOf bsfs:Predicate ; rdfs:domain bsfs:File ; rdfs:range xsd:integer ; diff --git a/bsie/lib/__init__.py b/bsie/lib/__init__.py index 578c2c4..4239d3b 100644 --- a/bsie/lib/__init__.py +++ b/bsie/lib/__init__.py @@ -4,15 +4,17 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # inner-module imports from .bsie import BSIE +from .builder import PipelineBuilder # exports __all__: typing.Sequence[str] = ( 'BSIE', + 'PipelineBuilder', ) ## EOF ## diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py index e087fa9..668783d 100644 --- a/bsie/lib/bsie.py +++ b/bsie/lib/bsie.py @@ -4,13 +4,15 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # bsie imports -from bsie.tools import Pipeline from bsie.utils import bsfs, node, ns +# inner-module imports +from .pipeline import Pipeline + # exports __all__: typing.Sequence[str] = ( 'BSIE', diff --git a/bsie/lib/builder.py b/bsie/lib/builder.py new file mode 100644 index 0000000..c2abffe --- /dev/null +++ b/bsie/lib/builder.py @@ -0,0 +1,85 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# standard imports +import logging +import typing + +# bsie imports +from bsie.extractor import ExtractorBuilder +from bsie.reader import ReaderBuilder +from bsie.utils import bsfs, errors + +# inner-module imports +from . import pipeline + +# exports +__all__: typing.Sequence[str] = ( + 'PipelineBuilder', + ) + + +## code ## + +logger = logging.getLogger(__name__) + +class PipelineBuilder(): + """Build `bsie.tools.pipeline.Pipeline` instances.""" + + # Prefix to be used in the Pipeline. + prefix: bsfs.Namespace + + # builder for Readers. + rbuild: ReaderBuilder + + # builder for Extractors. + ebuild: ExtractorBuilder + + def __init__( + self, + prefix: bsfs.Namespace, + reader_builder: ReaderBuilder, + extractor_builder: ExtractorBuilder, + ): + self.prefix = prefix + self.rbuild = reader_builder + self.ebuild = extractor_builder + + def build(self) -> pipeline.Pipeline: + """Return a Pipeline instance.""" + ext2rdr = {} + + for eidx in self.ebuild: + # build extractor + try: + ext = self.ebuild.build(eidx) + + except errors.LoaderError as err: # failed to load extractor; skip + logger.error('failed to load extractor: %s', err) + continue + + except errors.BuilderError as err: # failed to build instance; skip + logger.error(str(err)) + continue + + try: + # get reader required by extractor + if ext.CONTENT_READER is not None: + rdr = self.rbuild.build(ext.CONTENT_READER) + else: + rdr = None + # store extractor + ext2rdr[ext] = rdr + + except errors.LoaderError as err: # failed to load reader + logger.error('failed to load reader: %s', err) + + except errors.BuilderError as err: # failed to build reader + logger.error(str(err)) + + return pipeline.Pipeline(self.prefix, ext2rdr) + +## EOF ## diff --git a/bsie/lib/pipeline.py b/bsie/lib/pipeline.py new file mode 100644 index 0000000..e5ce1b7 --- /dev/null +++ b/bsie/lib/pipeline.py @@ -0,0 +1,145 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# standard imports +from collections import defaultdict +import logging +import typing + +# bsie imports +from bsie.extractor import Extractor +from bsie.reader import Reader +from bsie.utils import bsfs, errors, node, ns + +# exports +__all__: typing.Sequence[str] = ( + 'Pipeline', + ) + +# constants +FILE_PREFIX = 'file#' + +## code ## + +logger = logging.getLogger(__name__) + +class Pipeline(): + """Extraction pipeline to generate triples from files. + + The Pipeline binds readers and extractors, and performs + the necessary operations to produce triples from a file. + It takes a best-effort approach to extract as many triples + as possible. Errors during the extraction are passed over + and reported to the log. + + """ + + # combined extractor schemas. + _schema: bsfs.schema.Schema + + # node prefix. + _prefix: bsfs.Namespace + + # extractor -> reader mapping + _ext2rdr: typing.Dict[Extractor, typing.Optional[Reader]] + + def __init__( + self, + prefix: bsfs.Namespace, + ext2rdr: typing.Dict[Extractor, typing.Optional[Reader]] + ): + # store core members + self._prefix = prefix + FILE_PREFIX + self._ext2rdr = ext2rdr + # compile schema from all extractors + self._schema = bsfs.schema.Schema.Union(ext.schema for ext in ext2rdr) + + def __str__(self) -> str: + return bsfs.typename(self) + + def __repr__(self) -> str: + return f'{bsfs.typename(self)}(...)' + + def __hash__(self) -> int: + return hash((type(self), self._prefix, self._schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values()))) + + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) \ + and self._schema == other._schema \ + and self._prefix == other._prefix \ + and self._ext2rdr == other._ext2rdr + + @property + def schema(self) -> bsfs.schema.Schema: + """Return the pipeline's schema (combined from all extractors).""" + return self._schema + + @property + def principals(self) -> typing.Iterator[bsfs.schema.Predicate]: + """Return the principal predicates that can be extracted.""" + return iter({pred for ext in self._ext2rdr for pred in ext.principals}) + + def subschema(self, principals: typing.Iterable[bsfs.schema.Predicate]) -> bsfs.schema.Schema: + """Return the subset of the schema that supports the given *principals*.""" + # materialize principals + principals = set(principals) + # collect and combine schemas from extractors + return bsfs.schema.Schema.Union({ + ext.schema + for ext + in self._ext2rdr + if not set(ext.principals).isdisjoint(principals) + }) + + def __call__( + self, + path: bsfs.URI, + principals: typing.Optional[typing.Iterable[bsfs.schema.Predicate]] = None, + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + """Extract triples from the file at *path*. Optionally, limit triples to *principals*.""" + # get principals + principals = set(principals) if principals is not None else set(self.schema.predicates()) + + # get extractors + extractors = {ext for ext in self._ext2rdr if not set(ext.principals).isdisjoint(principals)} + + # corner-case short-cut + if len(extractors) == 0: + return + + # get readers -> extractors mapping + rdr2ext = defaultdict(set) + for ext in extractors: + rdr = self._ext2rdr[ext] + rdr2ext[rdr].add(ext) + + # create subject for file + uuid = bsfs.uuid.UCID.from_path(path) + subject = node.Node(ns.bsfs.File, self._prefix[uuid]) + + # extract information + for rdr, extrs in rdr2ext.items(): + try: + # get content + content = rdr(path) if rdr is not None else None + + # apply extractors on this content + for ext in extrs: + try: + # get predicate/value tuples + for subject, pred, value in ext.extract(subject, content, principals): + yield subject, pred, value + + except errors.ExtractorError as err: + # critical extractor failure. + logger.error('%s failed to extract triples from content: %s', ext, err) + + except errors.ReaderError as err: + # failed to read any content. skip. + logger.error('%s failed to read content: %s', rdr, err) + + +## EOF ## diff --git a/bsie/reader/__init__.py b/bsie/reader/__init__.py index a45f22b..4163d1c 100644 --- a/bsie/reader/__init__.py +++ b/bsie/reader/__init__.py @@ -15,5 +15,18 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ +# standard imports +import typing +# inner-module imports +from .base import Reader +from .builder import ReaderBuilder + +# exports +__all__: typing.Sequence[str] = ( + 'Reader', + 'ReaderBuilder', + ) + +## EOF ## ## EOF ## diff --git a/bsie/reader/base.py b/bsie/reader/base.py new file mode 100644 index 0000000..cbabd36 --- /dev/null +++ b/bsie/reader/base.py @@ -0,0 +1,47 @@ +"""The Reader classes return high-level content structures from files. + +The Reader fulfills two purposes: + First, it brokers between multiple libraries and file formats. + Second, it separates multiple aspects of a file into distinct content types. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import abc +import typing + +# bsie imports +from bsie.utils import bsfs + +# exports +__all__: typing.Sequence[str] = ( + 'Reader', + ) + + +## code ## + +class Reader(abc.ABC): + """Read and return some content from a file.""" + + def __str__(self) -> str: + return bsfs.typename(self) + + def __repr__(self) -> str: + return f'{bsfs.typename(self)}()' + + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) + + def __hash__(self) -> int: + return hash(type(self)) + + @abc.abstractmethod + def __call__(self, path: bsfs.URI) -> typing.Any: + """Return some content of the file at *path*. + Raises a `ReaderError` if the reader cannot make sense of the file format. + """ + +## EOF ## diff --git a/bsie/reader/builder.py b/bsie/reader/builder.py new file mode 100644 index 0000000..bce5397 --- /dev/null +++ b/bsie/reader/builder.py @@ -0,0 +1,74 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# standard imports +import typing + +# bsie imports +from bsie.utils import bsfs, errors, safe_load, unpack_qualified_name + +# inner-module imports +from . import base + +# exports +__all__: typing.Sequence[str] = ( + 'ReaderBuilder', + ) + + +## code ## + +class ReaderBuilder(): + """Build `bsie.base.Reader` instances. + + Readers are defined via their qualified class name + (e.g., bsie.reader.path.Path) and optional keyword + arguments that are passed to the constructor via + the *kwargs* argument (name as key, kwargs as value). + The ReaderBuilder keeps a cache of previously built + reader instances, as they are anyway built with + identical keyword arguments. + + """ + + # keyword arguments + _kwargs: typing.Dict[str, typing.Dict[str, typing.Any]] + + # cached readers + _cache: typing.Dict[str, base.Reader] + + def __init__(self, kwargs: typing.Dict[str, typing.Dict[str, typing.Any]]): + self._kwargs = kwargs + self._cache = {} + + def build(self, name: str) -> base.Reader: + """Return an instance for the qualified class name.""" + # return cached instance + if name in self._cache: + return self._cache[name] + + # check name and get module/class components + module_name, class_name = unpack_qualified_name(name) + + # import reader class + cls = safe_load(module_name, class_name) + + # get kwargs + kwargs = self._kwargs.get(name, {}) + if not isinstance(kwargs, dict): + raise TypeError(f'expected a kwargs dict, found {bsfs.typename(kwargs)}') + + try: # build, cache, and return instance + obj = cls(**kwargs) + # cache instance + self._cache[name] = obj + # return instance + return obj + + except Exception as err: + raise errors.BuilderError(f'failed to build reader {name} due to {bsfs.typename(err)}: {err}') from err + +## EOF ## diff --git a/bsie/reader/path.py b/bsie/reader/path.py index d60f187..1ca05a0 100644 --- a/bsie/reader/path.py +++ b/bsie/reader/path.py @@ -4,11 +4,11 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing -# bsie imports -from bsie.base import reader +# inner-module imports +from . import base # exports __all__: typing.Sequence[str] = ( @@ -18,7 +18,7 @@ __all__: typing.Sequence[str] = ( ## code ## -class Path(reader.Reader): +class Path(base.Reader): """Return the path.""" def __call__(self, path: str) -> str: diff --git a/bsie/reader/stat.py b/bsie/reader/stat.py index fc5fb24..706dc47 100644 --- a/bsie/reader/stat.py +++ b/bsie/reader/stat.py @@ -4,12 +4,15 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import os import typing # bsie imports -from bsie.base import errors, reader +from bsie.utils import errors + +# inner-module imports +from . import base # exports __all__: typing.Sequence[str] = ( @@ -19,7 +22,7 @@ __all__: typing.Sequence[str] = ( ## code ## -class Stat(reader.Reader): +class Stat(base.Reader): """Read and return the filesystem's stat infos.""" def __call__(self, path: str) -> os.stat_result: diff --git a/bsie/tools/__init__.py b/bsie/tools/__init__.py deleted file mode 100644 index 803c321..0000000 --- a/bsie/tools/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -""" - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports -import typing - -# inner-module imports -from . import builder -from .pipeline import Pipeline - -# exports -__all__: typing.Sequence[str] = ( - 'builder', - 'Pipeline', - ) - -## EOF ## diff --git a/bsie/tools/builder.py b/bsie/tools/builder.py deleted file mode 100644 index 190d9bf..0000000 --- a/bsie/tools/builder.py +++ /dev/null @@ -1,226 +0,0 @@ -""" - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports -import importlib -import logging -import typing - -# bsie imports -from bsie import base -from bsie.base import errors -from bsie.utils import bsfs - -# inner-module imports -from . import pipeline - -# exports -__all__: typing.Sequence[str] = ( - 'ExtractorBuilder', - 'PipelineBuilder', - 'ReaderBuilder', - ) - - -## code ## - -logger = logging.getLogger(__name__) - -def _safe_load(module_name: str, class_name: str): - """Get a class from a module. Raise BuilderError if anything goes wrong.""" - try: - # load the module - module = importlib.import_module(module_name) - except Exception as err: - # cannot import module - raise errors.LoaderError(f'cannot load module {module_name}') from err - - try: - # get the class from the module - cls = getattr(module, class_name) - except Exception as err: - # cannot find the class - raise errors.LoaderError(f'cannot load class {class_name} from module {module_name}') from err - - return cls - - -def _unpack_name(name): - """Split a name into its module and class component (dot-separated).""" - if not isinstance(name, str): - raise TypeError(name) - if '.' not in name: - raise ValueError('name must be a qualified class name.') - module_name, class_name = name[:name.rfind('.')], name[name.rfind('.')+1:] - if module_name == '': - raise ValueError('name must be a qualified class name.') - return module_name, class_name - - -class ReaderBuilder(): - """Build `bsie.base.Reader` instances. - - Readers are defined via their qualified class name - (e.g., bsie.reader.path.Path) and optional keyword - arguments that are passed to the constructor via - the *kwargs* argument (name as key, kwargs as value). - The ReaderBuilder keeps a cache of previously built - reader instances, as they are anyway built with - identical keyword arguments. - - """ - - # keyword arguments - _kwargs: typing.Dict[str, typing.Dict[str, typing.Any]] - - # cached readers - _cache: typing.Dict[str, base.Reader] - - def __init__(self, kwargs: typing.Dict[str, typing.Dict[str, typing.Any]]): - self._kwargs = kwargs - self._cache = {} - - def build(self, name: str) -> base.Reader: - """Return an instance for the qualified class name.""" - # return cached instance - if name in self._cache: - return self._cache[name] - - # check name and get module/class components - module_name, class_name = _unpack_name(name) - - # import reader class - cls = _safe_load(module_name, class_name) - - # get kwargs - kwargs = self._kwargs.get(name, {}) - if not isinstance(kwargs, dict): - raise TypeError(f'expected a kwargs dict, found {bsfs.typename(kwargs)}') - - try: # build, cache, and return instance - obj = cls(**kwargs) - # cache instance - self._cache[name] = obj - # return instance - return obj - - except Exception as err: - raise errors.BuilderError(f'failed to build reader {name} due to {bsfs.typename(err)}: {err}') from err - - -class ExtractorBuilder(): - """Build `bsie.base.Extractor instances. - - It is permissible to build multiple instances of the same extractor - (typically with different arguments), hence the ExtractorBuilder - receives a list of build specifications. Each specification is - a dict with a single key (extractor's qualified name) and a dict - to be used as keyword arguments. - Example: [{'bsie.extractor.generic.path.Path': {}}, ] - - """ - - # build specifications - _specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]] - - def __init__(self, specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]): - self._specs = specs - - def __iter__(self) -> typing.Iterator[int]: - """Iterate over extractor specifications.""" - return iter(range(len(self._specs))) - - def build(self, index: int) -> base.Extractor: - """Return an instance of the n'th extractor (n=*index*).""" - # get build instructions - specs = self._specs[index] - - # check specs structure. expecting[{name: {kwargs}}] - if not isinstance(specs, dict): - raise TypeError(f'expected a dict, found {bsfs.typename(specs)}') - if len(specs) != 1: - raise TypeError(f'expected a dict of length one, found {len(specs)}') - - # get name and args from specs - name = next(iter(specs.keys())) - kwargs = specs[name] - - # check kwargs structure - if not isinstance(kwargs, dict): - raise TypeError(f'expected a dict, found {bsfs.typename(kwargs)}') - - # check name and get module/class components - module_name, class_name = _unpack_name(name) - - # import extractor class - cls = _safe_load(module_name, class_name) - - try: # build and return instance - return cls(**kwargs) - - except Exception as err: - raise errors.BuilderError(f'failed to build extractor {name} due to {bsfs.typename(err)}: {err}') from err - - -class PipelineBuilder(): - """Build `bsie.tools.pipeline.Pipeline` instances.""" - - # Prefix to be used in the Pipeline. - prefix: bsfs.Namespace - - # builder for Readers. - rbuild: ReaderBuilder - - # builder for Extractors. - ebuild: ExtractorBuilder - - def __init__( - self, - prefix: bsfs.Namespace, - reader_builder: ReaderBuilder, - extractor_builder: ExtractorBuilder, - ): - self.prefix = prefix - self.rbuild = reader_builder - self.ebuild = extractor_builder - - def build(self) -> pipeline.Pipeline: - """Return a Pipeline instance.""" - ext2rdr = {} - - for eidx in self.ebuild: - # build extractor - try: - ext = self.ebuild.build(eidx) - - except errors.LoaderError as err: # failed to load extractor; skip - logger.error('failed to load extractor: %s', err) - continue - - except errors.BuilderError as err: # failed to build instance; skip - logger.error(str(err)) - continue - - try: - # get reader required by extractor - if ext.CONTENT_READER is not None: - rdr = self.rbuild.build(ext.CONTENT_READER) - else: - rdr = None - # store extractor - ext2rdr[ext] = rdr - - except errors.LoaderError as err: # failed to load reader - logger.error('failed to load reader: %s', err) - - except errors.BuilderError as err: # failed to build reader - logger.error(str(err)) - - return pipeline.Pipeline(self.prefix, ext2rdr) - - - -## EOF ## diff --git a/bsie/tools/pipeline.py b/bsie/tools/pipeline.py deleted file mode 100644 index 20e8ddf..0000000 --- a/bsie/tools/pipeline.py +++ /dev/null @@ -1,144 +0,0 @@ -""" - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports -from collections import defaultdict -import logging -import typing - -# bsie imports -from bsie import base -from bsie.utils import bsfs, node, ns - -# exports -__all__: typing.Sequence[str] = ( - 'Pipeline', - ) - -# constants -FILE_PREFIX = 'file#' - -## code ## - -logger = logging.getLogger(__name__) - -class Pipeline(): - """Extraction pipeline to generate triples from files. - - The Pipeline binds readers and extractors, and performs - the necessary operations to produce triples from a file. - It takes a best-effort approach to extract as many triples - as possible. Errors during the extraction are passed over - and reported to the log. - - """ - - # combined extractor schemas. - _schema: bsfs.schema.Schema - - # node prefix. - _prefix: bsfs.Namespace - - # extractor -> reader mapping - _ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]] - - def __init__( - self, - prefix: bsfs.Namespace, - ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]] - ): - # store core members - self._prefix = prefix + FILE_PREFIX - self._ext2rdr = ext2rdr - # compile schema from all extractors - self._schema = bsfs.schema.Schema.Union(ext.schema for ext in ext2rdr) - - def __str__(self) -> str: - return bsfs.typename(self) - - def __repr__(self) -> str: - return f'{bsfs.typename(self)}(...)' - - def __hash__(self) -> int: - return hash((type(self), self._prefix, self._schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values()))) - - def __eq__(self, other: typing.Any) -> bool: - return isinstance(other, type(self)) \ - and self._schema == other._schema \ - and self._prefix == other._prefix \ - and self._ext2rdr == other._ext2rdr - - @property - def schema(self) -> bsfs.schema.Schema: - """Return the pipeline's schema (combined from all extractors).""" - return self._schema - - @property - def principals(self) -> typing.Iterator[bsfs.schema.Predicate]: - """Return the principal predicates that can be extracted.""" - return iter({pred for ext in self._ext2rdr for pred in ext.principals}) - - def subschema(self, principals: typing.Iterable[bsfs.schema.Predicate]) -> bsfs.schema.Schema: - """Return the subset of the schema that supports the given *principals*.""" - # materialize principals - principals = set(principals) - # collect and combine schemas from extractors - return bsfs.schema.Schema.Union({ - ext.schema - for ext - in self._ext2rdr - if not set(ext.principals).isdisjoint(principals) - }) - - def __call__( - self, - path: bsfs.URI, - principals: typing.Optional[typing.Iterable[bsfs.schema.Predicate]] = None, - ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: - """Extract triples from the file at *path*. Optionally, limit triples to *principals*.""" - # get principals - principals = set(principals) if principals is not None else set(self.schema.predicates()) - - # get extractors - extractors = {ext for ext in self._ext2rdr if not set(ext.principals).isdisjoint(principals)} - - # corner-case short-cut - if len(extractors) == 0: - return - - # get readers -> extractors mapping - rdr2ext = defaultdict(set) - for ext in extractors: - rdr = self._ext2rdr[ext] - rdr2ext[rdr].add(ext) - - # create subject for file - uuid = bsfs.uuid.UCID.from_path(path) - subject = node.Node(ns.bsfs.File, self._prefix[uuid]) - - # extract information - for rdr, extrs in rdr2ext.items(): - try: - # get content - content = rdr(path) if rdr is not None else None - - # apply extractors on this content - for ext in extrs: - try: - # get predicate/value tuples - for subject, pred, value in ext.extract(subject, content, principals): - yield subject, pred, value - - except base.errors.ExtractorError as err: - # critical extractor failure. - logger.error('%s failed to extract triples from content: %s', ext, err) - - except base.errors.ReaderError as err: - # failed to read any content. skip. - logger.error('%s failed to read content: %s', rdr, err) - - -## EOF ## diff --git a/bsie/utils/__init__.py b/bsie/utils/__init__.py index 3981dc7..9cb60ed 100644 --- a/bsie/utils/__init__.py +++ b/bsie/utils/__init__.py @@ -4,21 +4,24 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # inner-module imports from . import bsfs +from . import filematcher from . import namespaces as ns from . import node -from . import filematcher +from .loading import safe_load, unpack_qualified_name # exports __all__: typing.Sequence[str] = ( - 'filematcher', 'bsfs', + 'filematcher', 'node', 'ns', + 'safe_load', + 'unpack_qualified_name', ) ## EOF ## diff --git a/bsie/utils/errors.py b/bsie/utils/errors.py new file mode 100644 index 0000000..5fafd5b --- /dev/null +++ b/bsie/utils/errors.py @@ -0,0 +1,45 @@ +"""Common BSIE exceptions. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# exports +__all__: typing.Sequence[str] = ( + 'BuilderError', + 'ExtractorError', + 'LoaderError', + 'ReaderError', + ) + + +## code ## + +class _BSIEError(Exception): + """Generic BSIE error.""" + +class BuilderError(_BSIEError): + """The Builder failed to create an instance.""" + +class LoaderError(BuilderError): + """Failed to load a module or class.""" + +class ExtractorError(_BSIEError): + """The Extractor failed to process the given content.""" + +class ReaderError(_BSIEError): + """The Reader failed to read the given file.""" + +class ProgrammingError(_BSIEError): + """An assertion-like error that indicates a code-base issue.""" + +class UnreachableError(ProgrammingError): + """Bravo, you've reached a point in code that should logically not be reachable.""" + +class ParserError(_BSIEError): + """Failed to parse due to invalid syntax or structures.""" + +## EOF ## diff --git a/bsie/utils/filematcher/parser.py b/bsie/utils/filematcher/parser.py index 0654742..2f82875 100644 --- a/bsie/utils/filematcher/parser.py +++ b/bsie/utils/filematcher/parser.py @@ -7,16 +7,14 @@ Author: Matthias Baumgartner, 2021 # standard imports import typing -# non-standard imports +# external imports import pyparsing from pyparsing import printables, alphas8bit, punc8bit, QuotedString, Word, \ delimitedList, Or, CaselessKeyword, Group, oneOf, Optional -# bsie imports -from bsie.base import errors - # inner-module imports from . import matcher +from .. import errors # exports __all__: typing.Sequence[str] = ( diff --git a/bsie/utils/loading.py b/bsie/utils/loading.py new file mode 100644 index 0000000..eb05c35 --- /dev/null +++ b/bsie/utils/loading.py @@ -0,0 +1,54 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# standard imports +import importlib +import typing + +# inner-module imports +from . import errors + +# exports +__all__: typing.Sequence[str] = ( + 'safe_load', + 'unpack_qualified_name', + ) + + +## code ## + +def safe_load(module_name: str, class_name: str): + """Get a class from a module. Raise BuilderError if anything goes wrong.""" + try: + # load the module + module = importlib.import_module(module_name) + except Exception as err: + # cannot import module + raise errors.LoaderError(f'cannot load module {module_name}') from err + + try: + # get the class from the module + cls = getattr(module, class_name) + except Exception as err: + # cannot find the class + raise errors.LoaderError(f'cannot load class {class_name} from module {module_name}') from err + + return cls + + +def unpack_qualified_name(name): + """Split a name into its module and class component (dot-separated).""" + if not isinstance(name, str): + raise TypeError(name) + if '.' not in name: + raise ValueError('name must be a qualified class name.') + module_name, class_name = name[:name.rfind('.')], name[name.rfind('.')+1:] + if module_name == '': + raise ValueError('name must be a qualified class name.') + return module_name, class_name + + +## EOF ## -- cgit v1.2.3