diff options
author | Matthias Baumgartner <dev@igsor.net> | 2022-11-25 14:59:17 +0100 |
---|---|---|
committer | Matthias Baumgartner <dev@igsor.net> | 2022-11-25 14:59:17 +0100 |
commit | a294bbe0622911bcd6df37c38865a4c0eb290593 (patch) | |
tree | f038ed8d4f04c63991939e13e61ae170de4e2c57 | |
parent | 9389c741bdbbca9adbff6099d440706cd63deac4 (diff) | |
parent | 3e6a69ce7f109f0fd4352507ad60d58d4cbd24a7 (diff) | |
download | bsie-a294bbe0622911bcd6df37c38865a4c0eb290593.tar.gz bsie-a294bbe0622911bcd6df37c38865a4c0eb290593.tar.bz2 bsie-a294bbe0622911bcd6df37c38865a4c0eb290593.zip |
Merge branch 'mb/tools' into develop
30 files changed, 1546 insertions, 124 deletions
diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..40f07cc --- /dev/null +++ b/.coveragerc @@ -0,0 +1,15 @@ +[run] +dynamic_context = test_function +branch = True +source = bsie +data_file = .coverage +command_line = -m unittest + +[report] +show_missing = True +skip_empty = True + +[html] +directory = .htmlcov +show_contexts = True + diff --git a/.mypy.ini b/.mypy.ini new file mode 100644 index 0000000..4d0a25d --- /dev/null +++ b/.mypy.ini @@ -0,0 +1,3 @@ +[mypy] +ignore_missing_imports = True +packages=bsie diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..3cfae38 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,193 @@ +[MAIN] + +# Pickle collected data for later comparisons. +persistent=no + +# Minimum Python version to use for version dependent checks. Will default to +# the version used to run pylint. +py-version=3.8 + +# Discover python modules and packages in the file system subtree. +recursive=yes + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + + +[BASIC] + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# Bad variable names which should always be refused, separated by a comma. +bad-names=foo,bar,abc,cba,xyz,zyx,foobar,hello,world + +# Good variable names which should always be accepted, separated by a comma. +good-names=i,j,k,n,_ + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Naming style matching correct class constant names. +class-const-naming-style=UPPER_CASE + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Naming style matching correct constant names. 
+const-naming-style=UPPER_CASE + +# Naming style matching correct function names. +function-naming-style=snake_case + +# Include a hint for the correct naming format with invalid-name. +include-naming-hint=yes + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Naming style matching correct variable names. +variable-naming-style=snake_case + + +[DESIGN] + +# Maximum number of arguments for function / method. +max-args=5 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr=5 + +# Maximum number of branch for function / method body. +max-branches=12 + +# Maximum number of locals for function / method body. +max-locals=15 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of return / yield for function / method body. +max-returns=6 + +# Maximum number of statements in function / method body. +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=1 + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )?<?https?://\S+>?$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=120 + +# Maximum number of lines in a module. 
+max-module-lines=1000 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[IMPORTS] + +# List of modules that can be imported at any level, not just the top level +# one. +allow-any-import-level= + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + + +[LOGGING] + +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. +logging-format-style=old + + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME,TODO,NOTE + + + +[REPORTS] + +# Tells whether to display a full report or only the messages. +reports=yes + +# Activate the evaluation score. +score=yes + + +[SIMILARITIES] + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[STRING] + +# This flag controls whether inconsistent-quotes generates a warning when the +# character used as a quote delimiter is used inconsistently within a module. +check-quote-consistency=yes + + +[TYPECHECK] + +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=no + + +[VARIABLES] + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. 
+callbacks=clbk,callback + + + + +# Disable: R1735 (use-dict-literal) @@ -3,3 +3,54 @@ Black Star Information Extraction ================================= +### Developer tools setup + +#### Test coverage (coverage) + +Resources: +* https://coverage.readthedocs.io/en/6.5.0/index.html +* https://nedbatchelder.com/blog/200710/flaws_in_coverage_measurement.html + +Commands: +$ pip install coverage +$ coverage run ; coverage html ; xdg-open .htmlcov/index.html + + + +#### Static code analysis (pylint) + +Resources: +* https://github.com/PyCQA/pylint +* https://pylint.org/ +* https://pylint.pycqa.org/en/latest/user_guide/messages/messages_overview.html#messages-overview + +Commands: +$ pip install pylint +$ pylint bsie + + + +#### Type analysis (mypy) + +Resources: +* https://github.com/python/mypy +* https://mypy.readthedocs.io/en/stable/ + +Commands: +$ pip install mypy +$ mypy + + + +#### Documentation (sphinx) + +Resources: +* +* + +Commands: +$ pip install ... +$ + + + diff --git a/bsie/base/errors.py b/bsie/base/errors.py index f86ffb2..760351f 100644 --- a/bsie/base/errors.py +++ b/bsie/base/errors.py @@ -8,15 +8,29 @@ Author: Matthias Baumgartner, 2022 import typing # exports -__all__: typing.Sequence[str] = [] +__all__: typing.Sequence[str] = ( + 'BuilderError', + 'ExtractorError', + 'LoaderError', + 'ReaderError', + ) ## code ## -class _BSIE_Error(Exception): +class _BSIEError(Exception): """Generic BSIE error.""" -class ReaderError(_BSIE_Error): +class BuilderError(_BSIEError): + """The Builder failed to create an instance.""" + +class LoaderError(BuilderError): + """Failed to load a module or class.""" + +class ExtractorError(_BSIEError): + """The Extractor failed to process the given content.""" + +class ReaderError(_BSIEError): """The Reader failed to read the given file.""" ## EOF ## diff --git a/bsie/base/extractor.py b/bsie/base/extractor.py index ea43925..2fc4f18 100644 --- a/bsie/base/extractor.py +++ b/bsie/base/extractor.py @@ -8,16 +8,40 @@ Author: 
Matthias Baumgartner, 2022 import abc import typing -# inner-module imports -from . import reader +# bsie imports from bsie.utils import node -from bsie.utils.bsfs import URI, typename +from bsie.utils.bsfs import schema as _schema, typename # exports __all__: typing.Sequence[str] = ( 'Extractor', ) +# constants + +# essential definitions typically used in extractor schemas. +# NOTE: The definition here is only for convenience; Each Extractor must implement its use, if so desired. +SCHEMA_PREAMBLE = ''' + # common external prefixes + prefix owl: <http://www.w3.org/2002/07/owl#> + prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> + prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> + prefix xsd: <http://www.w3.org/2001/XMLSchema#> + prefix schema: <http://schema.org/> + + # common bsfs prefixes + prefix bsfs: <http://bsfs.ai/schema/> + prefix bse: <http://bsfs.ai/schema/Entity#> + + # essential nodes + bsfs:Entity rdfs:subClassOf bsfs:Node . + + # common definitions + xsd:string rdfs:subClassOf bsfs:Literal . + xsd:integer rdfs:subClassOf bsfs:Literal . + + ''' + ## code ## @@ -25,7 +49,13 @@ class Extractor(abc.ABC): """Produce (node, predicate, value)-triples from some content.""" # what type of content is expected (i.e. reader subclass). - CONTENT_READER: typing.Optional[typing.Type[reader.Reader]] = None + CONTENT_READER: typing.Optional[str] = None + + # extractor schema. 
+ schema: _schema.Schema + + def __init__(self, schema: _schema.Schema): + self.schema = schema def __str__(self) -> str: return typename(self) @@ -33,17 +63,32 @@ class Extractor(abc.ABC): def __repr__(self) -> str: return f'{typename(self)}()' - @abc.abstractmethod - def schema(self) -> str: - """Return the schema (predicates and nodes) produced by this Extractor.""" + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) \ + and self.CONTENT_READER == other.CONTENT_READER \ + and self.schema == other.schema + + def __hash__(self) -> int: + return hash((type(self), self.CONTENT_READER, self.schema)) + + def predicates(self) -> typing.Iterator[_schema.Predicate]: + """Return the predicates that may be part of extracted triples.""" + # NOTE: Some predicates in the schema might not occur in actual triples, + # but are defined due to predicate class hierarchy. E.g., bsfs:Predicate + # is part of every schema but should not be used in triples. + # Announcing all predicates might not be the most efficient way, however, + # it is the most safe one. Concrete extractors that produce additional + # predicates (e.g. auxiliary nodes with their own predicates) should + # overwrite this method to only include the principal predicates. 
+ return self.schema.predicates() @abc.abstractmethod def extract( self, subject: node.Node, content: typing.Any, - predicates: typing.Iterable[URI], - ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]: + predicates: typing.Iterable[_schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, _schema.Predicate, typing.Any]]: """Return (node, predicate, value) triples.""" ## EOF ## diff --git a/bsie/base/reader.py b/bsie/base/reader.py index f29e451..b7eabf7 100644 --- a/bsie/base/reader.py +++ b/bsie/base/reader.py @@ -12,12 +12,11 @@ Author: Matthias Baumgartner, 2022 import abc import typing -# inner-module imports +# bsie imports from bsie.utils.bsfs import URI, typename # exports __all__: typing.Sequence[str] = ( - 'Aggregator', 'Reader', ) @@ -27,20 +26,20 @@ __all__: typing.Sequence[str] = ( class Reader(abc.ABC): """Read and return some content from a file.""" - # In what data structure content is returned - CONTENT_TYPE = typing.Union[typing.Any] - # NOTE: Child classes must also assign a typing.Union even if there's - # only one options - def __str__(self) -> str: return typename(self) def __repr__(self) -> str: return f'{typename(self)}()' - # FIXME: How about using contexts instead of calls? + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) + + def __hash__(self) -> int: + return hash(type(self)) + @abc.abstractmethod - def __call__(self, path: URI) -> CONTENT_TYPE: + def __call__(self, path: URI) -> typing.Any: """Return some content of the file at *path*. Raises a `ReaderError` if the reader cannot make sense of the file format. 
""" diff --git a/bsie/extractor/generic/constant.py b/bsie/extractor/generic/constant.py index e243131..7da792a 100644 --- a/bsie/extractor/generic/constant.py +++ b/bsie/extractor/generic/constant.py @@ -7,9 +7,9 @@ Author: Matthias Baumgartner, 2022 # imports import typing -# inner-module imports +# bsie imports from bsie.base import extractor -from bsie.utils.bsfs import URI +from bsie.utils.bsfs import URI, schema as _schema from bsie.utils.node import Node # exports @@ -25,26 +25,32 @@ class Constant(extractor.Extractor): CONTENT_READER = None + # predicate/value pairs to be produced. + _tuples: typing.Tuple[typing.Tuple[_schema.Predicate, typing.Any], ...] + def __init__( self, schema: str, tuples: typing.Iterable[typing.Tuple[URI, typing.Any]], ): - self._schema = schema - self._tuples = tuples - # FIXME: use schema instance for predicate checking - #self._tuples = [(pred, value) for pred, value in tuples if pred in schema] + super().__init__(_schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + schema)) + # NOTE: Raises a KeyError if the predicate is not part of the schema + self._tuples = tuple((self.schema.predicate(p_uri), value) for p_uri, value in tuples) # FIXME: use schema instance for value checking - def schema(self) -> str: - return self._schema + def __eq__(self, other: typing.Any) -> bool: + return super().__eq__(other) \ + and self._tuples == other._tuples + + def __hash__(self) -> int: + return hash((super().__hash__(), self._tuples)) def extract( self, subject: Node, content: None, - predicates: typing.Iterable[URI], - ) -> typing.Iterator[typing.Tuple[Node, URI, typing.Any]]: + predicates: typing.Iterable[_schema.Predicate], + ) -> typing.Iterator[typing.Tuple[Node, _schema.Predicate, typing.Any]]: for pred, value in self._tuples: if pred in predicates: yield subject, pred, value diff --git a/bsie/extractor/generic/path.py b/bsie/extractor/generic/path.py index c39bbd2..f346f97 100644 --- a/bsie/extractor/generic/path.py +++ 
b/bsie/extractor/generic/path.py @@ -8,11 +8,10 @@ Author: Matthias Baumgartner, 2022 import os import typing -# inner-module imports +# bsie imports from bsie.base import extractor from bsie.utils import node, ns -from bsie.utils.bsfs import URI -import bsie.reader.path +from bsie.utils.bsfs import schema # exports __all__: typing.Sequence[str] = ( @@ -25,32 +24,33 @@ __all__: typing.Sequence[str] = ( class Path(extractor.Extractor): """Extract information from file's path.""" - CONTENT_READER = bsie.reader.path.Path + CONTENT_READER = 'bsie.reader.path.Path' - def __init__(self): - self.__callmap = { - ns.bse.filename: self.__filename, - } + # mapping from predicate to handler function. + _callmap: typing.Dict[schema.Predicate, typing.Callable[[str], typing.Any]] - def schema(self) -> str: - return ''' - bse:filename a bsfs:Predicate ; + def __init__(self): + super().__init__(schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' + bse:filename rdfs:subClassOf bsfs:Predicate ; rdfs:domain bsfs:Entity ; rdfs:range xsd:string ; - rdf:label "File name"^^xsd:string ; + rdfs:label "File name"^^xsd:string ; schema:description "Filename of entity in some filesystem."^^xsd:string ; owl:maxCardinality "INF"^^xsd:number . 
- ''' + ''')) + self._callmap = { + self.schema.predicate(ns.bse.filename): self.__filename, + } def extract( self, subject: node.Node, - content: CONTENT_READER.CONTENT_TYPE, - predicates: typing.Iterable[URI], - ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]: + content: str, + predicates: typing.Iterable[schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, schema.Predicate, typing.Any]]: for pred in predicates: # find callback - clbk = self.__callmap.get(pred) + clbk = self._callmap.get(pred) if clbk is None: continue # get value @@ -60,11 +60,15 @@ class Path(extractor.Extractor): # produce triple yield subject, pred, value - def __filename(self, path: str) -> str: + def __filename(self, path: str) -> typing.Optional[str]: try: return os.path.basename(path) - except Exception: - # FIXME: some kind of error reporting (e.g. logging) + except Exception: # some error, skip. + # FIXME: some kind of error reporting (e.g. logging)? + # Options: (a) Fail silently (current); (b) Skip and report to log; + # (c) Raise ExtractorError (aborts extraction); (d) separate content type + # checks from basename errors (report content type errors, skip basename + # errors) return None ## EOF ## diff --git a/bsie/extractor/generic/stat.py b/bsie/extractor/generic/stat.py index d74369c..7088c0a 100644 --- a/bsie/extractor/generic/stat.py +++ b/bsie/extractor/generic/stat.py @@ -5,14 +5,13 @@ A copy of the license is provided with the project. 
Author: Matthias Baumgartner, 2022 """ # imports +import os import typing -# inner-module imports +# bsie imports from bsie.base import extractor from bsie.utils import node, ns -from bsie.utils.bsfs import URI -import bsie.reader.stat - +from bsie.utils.bsfs import schema as _schema # exports __all__: typing.Sequence[str] = ( @@ -25,32 +24,33 @@ __all__: typing.Sequence[str] = ( class Stat(extractor.Extractor): """Extract information from the file system.""" - CONTENT_READER = bsie.reader.stat.Stat + CONTENT_READER = 'bsie.reader.stat.Stat' - def __init__(self): - self.__callmap = { - ns.bse.filesize: self.__filesize, - } + # mapping from predicate to handler function. + _callmap: typing.Dict[_schema.Predicate, typing.Callable[[os.stat_result], typing.Any]] - def schema(self) -> str: - return ''' - bse:filesize a bsfs:Predicate ; + def __init__(self): + super().__init__(_schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' + bse:filesize rdfs:subClassOf bsfs:Predicate ; rdfs:domain bsfs:Entity ; rdfs:range xsd:integer ; - rdf:label "File size"^^xsd:string ; + rdfs:label "File size"^^xsd:string ; schema:description "File size of entity in some filesystem."^^xsd:string ; owl:maxCardinality "INF"^^xsd:number . 
- ''' + ''')) + self._callmap = { + self.schema.predicate(ns.bse.filesize): self.__filesize, + } def extract( self, subject: node.Node, - content: CONTENT_READER.CONTENT_TYPE, - predicates: typing.Iterable[URI], - ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]: + content: os.stat_result, + predicates: typing.Iterable[_schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, _schema.Predicate, typing.Any]]: for pred in predicates: # find callback - clbk = self.__callmap.get(pred) + clbk = self._callmap.get(pred) if clbk is None: continue # get value @@ -60,7 +60,7 @@ class Stat(extractor.Extractor): # produce triple yield subject, pred, value - def __filesize(self, content: CONTENT_READER.CONTENT_TYPE) -> int: + def __filesize(self, content: os.stat_result) -> typing.Optional[int]: """Return the file size.""" try: return content.st_size diff --git a/bsie/reader/path.py b/bsie/reader/path.py index d27c664..d60f187 100644 --- a/bsie/reader/path.py +++ b/bsie/reader/path.py @@ -5,10 +5,9 @@ A copy of the license is provided with the project. 
Author: Matthias Baumgartner, 2022 """ # imports -import os import typing -# inner-module imports +# bsie imports from bsie.base import reader # exports @@ -22,9 +21,7 @@ __all__: typing.Sequence[str] = ( class Path(reader.Reader): """Return the path.""" - CONTENT_TYPE = typing.Union[str] - - def __call__(self, path: str) -> CONTENT_TYPE: + def __call__(self, path: str) -> str: return path diff --git a/bsie/reader/stat.py b/bsie/reader/stat.py index f0b83fb..592d912 100644 --- a/bsie/reader/stat.py +++ b/bsie/reader/stat.py @@ -8,7 +8,7 @@ Author: Matthias Baumgartner, 2022 import os import typing -# inner-module imports +# bsie imports from bsie.base import reader, errors # exports @@ -22,13 +22,11 @@ __all__: typing.Sequence[str] = ( class Stat(reader.Reader): """Read and return the filesystem's stat infos.""" - CONTENT_TYPE = typing.Union[os.stat_result] - - def __call__(self, path: str) -> CONTENT_TYPE: + def __call__(self, path: str) -> os.stat_result: try: return os.stat(path) - except Exception: - raise errors.ReaderError(path) + except Exception as err: + raise errors.ReaderError(path) from err ## EOF ## diff --git a/bsie/tools/__init__.py b/bsie/tools/__init__.py new file mode 100644 index 0000000..8ca9620 --- /dev/null +++ b/bsie/tools/__init__.py @@ -0,0 +1,20 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# inner-module imports +from . import builder +from . import pipeline + +# exports +__all__: typing.Sequence[str] = ( + 'builder', + 'pipeline', + ) + +## EOF ## diff --git a/bsie/tools/builder.py b/bsie/tools/builder.py new file mode 100644 index 0000000..8f7a410 --- /dev/null +++ b/bsie/tools/builder.py @@ -0,0 +1,217 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. 
+Author: Matthias Baumgartner, 2022 +""" +# imports +import importlib +import logging +import typing + +# bsie imports +from bsie import base +from bsie.base import errors +from bsie.utils.bsfs import URI, typename + +# inner-module imports +from . import pipeline + +# exports +__all__: typing.Sequence[str] = ( + 'ExtractorBuilder', + 'PipelineBuilder', + 'ReaderBuilder', + ) + + +## code ## + +logger = logging.getLogger(__name__) + +def _safe_load(module_name: str, class_name: str): + """Get a class from a module. Raise BuilderError if anything goes wrong.""" + try: + # load the module + module = importlib.import_module(module_name) + except Exception as err: + # cannot import module + raise errors.LoaderError(f'cannot load module {module_name}') from err + + try: + # get the class from the module + cls = getattr(module, class_name) + except Exception as err: + # cannot find the class + raise errors.LoaderError(f'cannot load class {class_name} from module {module_name}') from err + + return cls + + +def _unpack_name(name): + """Split a name into its module and class component (dot-separated).""" + if not isinstance(name, str): + raise TypeError(name) + if '.' not in name: + raise ValueError('name must be a qualified class name.') + module_name, class_name = name[:name.rfind('.')], name[name.rfind('.')+1:] + if module_name == '': + raise ValueError('name must be a qualified class name.') + return module_name, class_name + + +class ReaderBuilder(): + """Build `bsie.base.reader.Reader` instances. + + Readers are defined via their qualified class name + (e.g., bsie.reader.path.Path) and optional keyword + arguments that are passed to the constructor via + the *kwargs* argument (name as key, kwargs as value). + The ReaderBuilder keeps a cache of previously built + reader instances, as they are anyway built with + identical keyword arguments. 
+ + """ + + # keyword arguments + kwargs: typing.Dict[str, typing.Dict[str, typing.Any]] + + # cached readers + cache: typing.Dict[str, base.reader.Reader] + + def __init__(self, kwargs: typing.Dict[str, typing.Dict[str, typing.Any]]): + self.kwargs = kwargs + self.cache = {} + + def build(self, name: str) -> base.reader.Reader: + """Return an instance for the qualified class name.""" + # return cached instance + if name in self.cache: + return self.cache[name] + + # check name and get module/class components + module_name, class_name = _unpack_name(name) + + # import reader class + cls = _safe_load(module_name, class_name) + + # get kwargs + kwargs = self.kwargs.get(name, {}) + if not isinstance(kwargs, dict): + raise TypeError(f'expected a kwargs dict, found {typename(kwargs)}') + + try: # build, cache, and return instance + obj = cls(**kwargs) + # cache instance + self.cache[name] = obj + # return instance + return obj + + except Exception as err: + raise errors.BuilderError(f'failed to build reader {name} due to {typename(err)}: {err}') from err + + +class ExtractorBuilder(): + """Build `bsie.base.extractor.Extractor instances. + + It is permissible to build multiple instances of the same extractor + (typically with different arguments), hence the ExtractorBuilder + receives a list of build specifications. Each specification is + a dict with a single key (extractor's qualified name) and a dict + to be used as keyword arguments. 
+ Example: [{'bsie.extractor.generic.path.Path': {}}, ] + + """ + + # build specifications + specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]] + + def __init__(self, specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]): + self.specs = specs + + def __iter__(self) -> typing.Iterator[int]: + """Iterate over extractor specifications.""" + return iter(range(len(self.specs))) + + def build(self, index: int) -> base.extractor.Extractor: + """Return an instance of the n'th extractor (n=*index*).""" + # get build instructions + specs = self.specs[index] + + # check specs structure. expecting[{name: {kwargs}}] + if not isinstance(specs, dict): + raise TypeError(f'expected a dict, found {typename(specs)}') + if len(specs) != 1: + raise TypeError(f'expected a dict of length one, found {len(specs)}') + + # get name and args from specs + name = next(iter(specs.keys())) + kwargs = specs[name] + + # check kwargs structure + if not isinstance(kwargs, dict): + raise TypeError(f'expected a dict, found {typename(kwargs)}') + + # check name and get module/class components + module_name, class_name = _unpack_name(name) + + # import extractor class + cls = _safe_load(module_name, class_name) + + try: # build and return instance + return cls(**kwargs) + + except Exception as err: + raise errors.BuilderError(f'failed to build extractor {name} due to {typename(err)}: {err}') from err + + +class PipelineBuilder(): + """Build `bsie.tools.pipeline.Pipeline` instances.""" + + def __init__( + self, + prefix: URI, + reader_builder: ReaderBuilder, + extractor_builder: ExtractorBuilder, + ): + self.prefix = prefix + self.rbuild = reader_builder + self.ebuild = extractor_builder + + def build(self) -> pipeline.Pipeline: + """Return a Pipeline instance.""" + ext2rdr = {} + + for eidx in self.ebuild: + # build extractor + try: + ext = self.ebuild.build(eidx) + + except errors.LoaderError as err: # failed to load extractor; skip + logger.error('failed to load extractor: 
%s', err) + continue + + except errors.BuilderError as err: # failed to build instance; skip + logger.error(str(err)) + continue + + try: + # get reader required by extractor + if ext.CONTENT_READER is not None: + rdr = self.rbuild.build(ext.CONTENT_READER) + else: + rdr = None + # store extractor + ext2rdr[ext] = rdr + + except errors.LoaderError as err: # failed to load reader + logger.error('failed to load reader: %s', err) + + except errors.BuilderError as err: # failed to build reader + logger.error(str(err)) + + return pipeline.Pipeline(self.prefix, ext2rdr) + + + +## EOF ## diff --git a/bsie/tools/pipeline.py b/bsie/tools/pipeline.py new file mode 100644 index 0000000..8e1c992 --- /dev/null +++ b/bsie/tools/pipeline.py @@ -0,0 +1,121 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +from collections import defaultdict +import logging +import typing + +# bsie imports +from bsie import base +from bsie.utils import ns +from bsie.utils.node import Node +from bsie.utils.bsfs import schema as _schema, URI, uuid as _uuid, typename + +# exports +__all__: typing.Sequence[str] = ( + 'Pipeline', + ) + +## code ## + +logger = logging.getLogger(__name__) + +class Pipeline(): + """Extraction pipeline to generate triples from files. + + The Pipeline binds readers and extractors, and performs + the necessary operations to produce triples from a file. + It takes a best-effort approach to extract as many triples + as possible. Errors during the extraction are passed over + and reported to the log. + + """ + + # combined extractor schemas. + schema: _schema.Schema + + # node prefix. 
+ _prefix: URI + + # extractor -> reader mapping + _ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]] + + def __init__( + self, + prefix: URI, + ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]] + ): + # store core members + self._prefix = prefix + self._ext2rdr = ext2rdr + # compile schema from all extractors + self.schema = _schema.Schema.Union(ext.schema for ext in ext2rdr) + + def __str__(self) -> str: + return typename(self) + + def __repr__(self) -> str: + return f'{typename(self)}(...)' + + def __hash__(self) -> int: + return hash((type(self), self._prefix, self.schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values()))) + + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) \ + and self.schema == other.schema \ + and self._prefix == other._prefix \ + and self._ext2rdr == other._ext2rdr + + def __call__( + self, + path: URI, + predicates: typing.Optional[typing.Iterable[_schema.Predicate]] = None, + ) -> typing.Iterator[typing.Tuple[Node, _schema.Predicate, typing.Any]]: + """Extract triples from the file at *path*. 
Optionally, limit triples to *predicates*.""" + # get predicates + predicates = set(predicates) if predicates is not None else set(self.schema.predicates()) + + # get extractors + extractors = {ext for ext in self._ext2rdr if not set(ext.predicates()).isdisjoint(predicates)} + + # corner-case short-cut + if len(extractors) == 0: + return + + # get readers -> extractors mapping + rdr2ext = defaultdict(set) + for ext in extractors: + rdr = self._ext2rdr[ext] + rdr2ext[rdr].add(ext) + + # create subject for file + uuid = _uuid.UCID.from_path(path) + subject = Node(ns.bsfs.Entity, self._prefix + uuid) + + # extract information + for rdr, extrs in rdr2ext.items(): + try: + # get content + content = rdr(path) if rdr is not None else None + + # apply extractors on this content + for ext in extrs: + try: + # get predicate/value tuples + for node, pred, value in ext.extract(subject, content, predicates): + yield node, pred, value + + except base.errors.ExtractorError as err: + # critical extractor failure. + logger.error('%s failed to extract triples from content: %s', ext, err) + + except base.errors.ReaderError as err: + # failed to read any content. skip. + logger.error('%s failed to read content: %s', rdr, err) + + +## EOF ## diff --git a/bsie/utils/bsfs.py b/bsie/utils/bsfs.py index 1ae657c..a4b7626 100644 --- a/bsie/utils/bsfs.py +++ b/bsie/utils/bsfs.py @@ -8,14 +8,17 @@ Author: Matthias Baumgartner, 2022 import typing # bsfs imports +from bsfs import schema from bsfs.namespace import Namespace -from bsfs.utils import URI, typename +from bsfs.utils import URI, typename, uuid # exports __all__: typing.Sequence[str] = ( 'Namespace', 'URI', + 'schema', 'typename', + 'uuid', ) ## EOF ## diff --git a/bsie/utils/namespaces.py b/bsie/utils/namespaces.py index 67ccc71..13be96b 100644 --- a/bsie/utils/namespaces.py +++ b/bsie/utils/namespaces.py @@ -7,13 +7,14 @@ Author: Matthias Baumgartner, 2022 # imports import typing -# bsie imports +# inner-module imports from . 
import bsfs as _bsfs # constants bse = _bsfs.Namespace('http://bsfs.ai/schema/Entity#') bsfs = _bsfs.Namespace('http://bsfs.ai/schema/') bsm = _bsfs.Namespace('http://bsfs.ai/schema/meta#') +xsd = _bsfs.Namespace('http://www.w3.org/2001/XMLSchema#') # export __all__: typing.Sequence[str] = ( diff --git a/bsie/utils/node.py b/bsie/utils/node.py index 60863a4..c9c494f 100644 --- a/bsie/utils/node.py +++ b/bsie/utils/node.py @@ -7,12 +7,12 @@ Author: Matthias Baumgartner, 2022 # imports import typing -# inner-module imports -from bsie.utils.bsfs import URI +# bsie imports +from bsie.utils.bsfs import URI, typename # exports __all__: typing.Sequence[str] = ( - 'Node' + 'Node', ) @@ -36,4 +36,18 @@ class Node(): self.node_type = URI(node_type) self.uri = URI(uri) + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, Node) \ + and other.node_type == self.node_type \ + and other.uri == self.uri + + def __hash__(self) -> int: + return hash((type(self), self.node_type, self.uri)) + + def __str__(self) -> str: + return f'{typename(self)}({self.node_type}, {self.uri})' + + def __repr__(self) -> str: + return f'{typename(self)}({self.node_type}, {self.uri})' + ## EOF ## diff --git a/test/base/__init__.py b/test/base/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/test/base/__init__.py diff --git a/test/base/test_extractor.py b/test/base/test_extractor.py new file mode 100644 index 0000000..7a00079 --- /dev/null +++ b/test/base/test_extractor.py @@ -0,0 +1,70 @@ +""" + +Part of the bsie test suite. +A copy of the license is provided with the project. 
+Author: Matthias Baumgartner, 2022 +""" +# imports +import unittest + +# bsie imports +from bsie.utils import ns +from bsie.utils.bsfs import schema as _schema, URI + +# objects to test +from bsie.base import extractor + + +## code ## + +class StubExtractor(extractor.Extractor): + def __init__(self): + super().__init__(_schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' + bse:author rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:string ; + owl:maxCardinality "INF"^^xsd:number . + bse:comment rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:string ; + owl:maxCardinality "INF"^^xsd:number . + ''')) + + def extract(self, subject, content, predicates): + raise NotImplementedError() + +class StubSub(StubExtractor): + pass + +class TestExtractor(unittest.TestCase): + def test_essentials(self): + ext = StubExtractor() + self.assertEqual(str(ext), 'StubExtractor') + self.assertEqual(repr(ext), 'StubExtractor()') + self.assertEqual(ext, StubExtractor()) + self.assertEqual(hash(ext), hash(StubExtractor())) + + sub = StubSub() + self.assertEqual(str(sub), 'StubSub') + self.assertEqual(repr(sub), 'StubSub()') + self.assertEqual(sub, StubSub()) + self.assertEqual(hash(sub), hash(StubSub())) + self.assertNotEqual(ext, sub) + self.assertNotEqual(hash(ext), hash(sub)) + + def test_predicates(self): + schema = _schema.Schema.Empty() + entity = schema.node(ns.bsfs.Node).get_child(ns.bsfs.Entity) + string = schema.literal(ns.bsfs.Literal).get_child(URI('http://www.w3.org/2001/XMLSchema#string')) + p_author = schema.predicate(ns.bsfs.Predicate).get_child(ns.bse.author, domain=entity, range=string) + p_comment = schema.predicate(ns.bsfs.Predicate).get_child(ns.bse.comment, domain=entity, range=string) + ext = StubExtractor() + self.assertSetEqual(set(ext.predicates()), {p_author, p_comment} | set(schema.predicates())) + + +## main ## + +if __name__ == '__main__': + unittest.main() + +## EOF ## diff --git 
a/test/base/test_reader.py b/test/base/test_reader.py new file mode 100644 index 0000000..802b314 --- /dev/null +++ b/test/base/test_reader.py @@ -0,0 +1,45 @@ +""" + +Part of the bsie test suite. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import unittest + +# objects to test +from bsie.base import reader + + +## code ## + +class StubReader(reader.Reader): + def __call__(self, path): + raise NotImplementedError() + +class StubSub(StubReader): + pass + +class TestReader(unittest.TestCase): + def test_essentials(self): + ext = StubReader() + self.assertEqual(str(ext), 'StubReader') + self.assertEqual(repr(ext), 'StubReader()') + self.assertEqual(ext, StubReader()) + self.assertEqual(hash(ext), hash(StubReader())) + + sub = StubSub() + self.assertEqual(str(sub), 'StubSub') + self.assertEqual(repr(sub), 'StubSub()') + self.assertEqual(sub, StubSub()) + self.assertEqual(hash(sub), hash(StubSub())) + self.assertNotEqual(ext, sub) + self.assertNotEqual(hash(ext), hash(sub)) + + +## main ## + +if __name__ == '__main__': + unittest.main() + +## EOF ## diff --git a/test/extractor/generic/test_constant.py b/test/extractor/generic/test_constant.py index f3ab0a3..aa33fb4 100644 --- a/test/extractor/generic/test_constant.py +++ b/test/extractor/generic/test_constant.py @@ -20,39 +20,101 @@ from bsie.extractor.generic.constant import Constant class TestConstant(unittest.TestCase): def test_extract(self): schema = ''' - bse:author a bsfs:Predicate ; + bse:author rdfs:subClassOf bsfs:Predicate ; rdfs:domain bsfs:Entity ; rdfs:range xsd:string ; owl:maxCardinality "1"^^xsd:number . - - bse:comment a bsfs:Predicate ; + bse:comment rdfs:subClassOf bsfs:Predicate ; rdfs:domain bsfs:Entity ; rdfs:range xsd:string ; owl:maxCardinality "INF"^^xsd:number . 
- ''' tuples = [ (ns.bse.author, 'Me, myself, and I'), (ns.bse.comment, 'the quick brown fox jumps over the lazy dog.'), ] - node = Node(ns.bsfs.Entity, '') # Blank node - predicates = (ns.bse.author, ns.bse.comment) ext = Constant(schema, tuples) + node = Node(ns.bsfs.Entity, '') # Blank node + p_author = ext.schema.predicate(ns.bse.author) + p_comment = ext.schema.predicate(ns.bse.comment) + entity = ext.schema.node(ns.bsfs.Node).get_child(ns.bsfs.Entity) + string = ext.schema.literal(ns.bsfs.Literal).get_child(ns.xsd.string) # baseline - self.assertSetEqual(set(ext.extract(node, None, predicates)), - {(node, pred, value) for pred, value in tuples}) + self.assertSetEqual(set(ext.extract(node, None, (p_author, p_comment))), + {(node, p_author, 'Me, myself, and I'), + (node, p_comment, 'the quick brown fox jumps over the lazy dog.')}) # predicates is respected - self.assertSetEqual(set(ext.extract(node, None, (ns.bse.author, ns.bse.foobar))), - {(node, ns.bse.author, 'Me, myself, and I')}) - self.assertSetEqual(set(ext.extract(node, None, (ns.bse.comment, ns.bse.foobar))), - {(node, ns.bse.comment, 'the quick brown fox jumps over the lazy dog.')}) - self.assertSetEqual(set(ext.extract(node, None, (ns.bse.foobar, ns.bse.barfoo))), set()) - - # FIXME: should change! 
- # for now: no schema compliance - ext = Constant('', tuples) - self.assertSetEqual(set(ext.extract(node, None, predicates)), - {(node, pred, value) for pred, value in tuples}) + p_foobar = ext.schema.predicate(ns.bsfs.Predicate).get_child(ns.bse.foobar, domain=entity, range=entity) + self.assertSetEqual(set(ext.extract(node, None, (p_author, p_foobar))), + {(node, p_author, 'Me, myself, and I')}) + self.assertSetEqual(set(ext.extract(node, None, (p_comment, p_foobar))), + {(node, p_comment, 'the quick brown fox jumps over the lazy dog.')}) + p_barfoo = ext.schema.predicate(ns.bse.author).get_child(ns.bse.comment, domain=entity, range=string) + self.assertSetEqual(set(ext.extract(node, None, (p_foobar, p_barfoo))), set()) + + def test_construct(self): + # schema compliance + schema = ''' + bse:author rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:string ; + owl:maxCardinality "1"^^xsd:number . + bse:comment rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:string ; + owl:maxCardinality "INF"^^xsd:number . + ''' + # can create a schema + self.assertIsInstance(Constant(schema, [ + (ns.bse.author, 'Me, myself, and I'), + (ns.bse.comment, 'the quick brown fox jumps over the lazy dog.'), + ]), Constant) + # predicates are validated + self.assertRaises(KeyError, Constant, schema, [ + (ns.bse.author, 'Me, myself, and I'), + (ns.bse.foobar, 'foobar!')]) + # FIXME: values are validated + #class Foo(): pass # not string compatible + #self.assertRaises(ValueError, Constant, schema, [ + # (ns.bse.author, Foo())]) + + def test_eq(self): + schema_a = ''' + bse:author rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:string ; + owl:maxCardinality "1"^^xsd:number . + ''' + schema_b = ''' + bse:comment rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:string ; + owl:maxCardinality "INF"^^xsd:number . 
+ ''' + tuples_a = [(ns.bse.author, 'Me, myself, and I')] + tuples_b = [(ns.bse.comment, 'the quick brown fox jumps over the lazy dog.') ] + # distinct instances, same data + self.assertEqual( + Constant(schema_a, tuples_a), + Constant(schema_a, tuples_a)) + self.assertEqual( + hash(Constant(schema_a, tuples_a)), + hash(Constant(schema_a, tuples_a))) + # different data + self.assertNotEqual( + Constant(schema_a, tuples_a), + Constant(schema_b, tuples_b)) + self.assertNotEqual( + hash(Constant(schema_a, tuples_a)), + hash(Constant(schema_b, tuples_b))) + # different objects + class Foo(): pass + self.assertNotEqual(Constant(schema_a, tuples_a), Foo()) + self.assertNotEqual(hash(Constant(schema_a, tuples_a)), hash(Foo())) + self.assertNotEqual(Constant(schema_a, tuples_a), 123) + self.assertNotEqual(hash(Constant(schema_a, tuples_a)), hash(123)) + self.assertNotEqual(Constant(schema_a, tuples_a), None) + self.assertNotEqual(hash(Constant(schema_a, tuples_a)), hash(None)) ## main ## diff --git a/test/extractor/generic/test_path.py b/test/extractor/generic/test_path.py index 8623490..9376c7c 100644 --- a/test/extractor/generic/test_path.py +++ b/test/extractor/generic/test_path.py @@ -8,7 +8,9 @@ Author: Matthias Baumgartner, 2022 import unittest # bsie imports +from bsie import base from bsie.utils import ns +from bsie.utils.bsfs import schema from bsie.utils.node import Node # objects to test @@ -18,23 +20,52 @@ from bsie.extractor.generic.path import Path ## code ## class TestPath(unittest.TestCase): + def test_eq(self): + # distinct instances, same data + self.assertEqual(Path(), Path()) + # different classes + class Foo(): pass + self.assertNotEqual(Path(), Foo()) + self.assertNotEqual(Path(), 123) + self.assertNotEqual(Path(), None) + + def test_schema(self): + self.assertEqual(Path().schema, + schema.Schema.from_string(base.extractor.SCHEMA_PREAMBLE + ''' + bse:filename rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:string ; + 
owl:maxCardinality "INF"^^xsd:number . + ''')) + def test_extract(self): - node = Node(ns.bsfs.Entity, '') # Blank node ext = Path() + node = Node(ns.bsfs.Entity, '') # Blank node + content = '/tmp/foo/bar' + p_filename = ext.schema.predicate(ns.bse.filename) + entity = ext.schema.node(ns.bsfs.Node).get_child(ns.bsfs.Entity) + string = ext.schema.literal(ns.bsfs.Literal).get_child(ns.xsd.string) # baseline - self.assertSetEqual(set(ext.extract(node, '/tmp/foo/bar', (ns.bse.filename, ))), - {(node, ns.bse.filename, 'bar')}) + self.assertSetEqual(set(ext.extract(node, content, (p_filename, ))), + {(node, p_filename, 'bar')}) # predicates parameter is respected - self.assertSetEqual(set(ext.extract(node, '/tmp/foo/bar', (ns.bse.filename, ns.bse.foo))), - {(node, ns.bse.filename, 'bar')}) - self.assertSetEqual(set(ext.extract(node, '/tmp/foo/bar', (ns.bse.foo, ))), set()) + p_foo = ext.schema.predicate(ns.bsfs.Predicate).get_child(ns.bse.foo, domain=entity, range=string) # unsupported predicate + self.assertSetEqual(set(ext.extract(node, content, (p_filename, p_foo))), + {(node, p_filename, 'bar')}) + self.assertSetEqual(set(ext.extract(node, content, (p_foo, ))), set()) + # predicates are validated + p_bar = p_foo.get_child(ns.bse.filename) # same URI but different hierarchy + self.assertSetEqual(set(ext.extract(node, content, (p_filename, p_bar))), + {(node, p_filename, 'bar')}) + self.assertSetEqual(set(ext.extract(node, content, (p_bar, ))), set()) # path variations - self.assertSetEqual(set(ext.extract(node, 'bar', (ns.bse.filename, ))), - {(node, ns.bse.filename, 'bar')}) - self.assertSetEqual(set(ext.extract(node, '', (ns.bse.filename, ))), - {(node, ns.bse.filename, '')}) - self.assertSetEqual(set(ext.extract(node, None, (ns.bse.filename, ))), set()) + self.assertSetEqual(set(ext.extract(node, 'bar', (p_filename, ))), + {(node, p_filename, 'bar')}) + self.assertSetEqual(set(ext.extract(node, '', (p_filename, ))), + {(node, p_filename, '')}) + # errors are 
suppressed + self.assertSetEqual(set(ext.extract(node, None, (p_filename, ))), set()) ## main ## diff --git a/test/extractor/generic/test_stat.py b/test/extractor/generic/test_stat.py index f89b053..26dad6a 100644 --- a/test/extractor/generic/test_stat.py +++ b/test/extractor/generic/test_stat.py @@ -9,7 +9,9 @@ import os import unittest # bsie imports +from bsie import base from bsie.utils import ns +from bsie.utils.bsfs import schema from bsie.utils.node import Node # objects to test @@ -18,21 +20,51 @@ from bsie.extractor.generic.stat import Stat ## code ## -class TestConstant(unittest.TestCase): +class TestStat(unittest.TestCase): + def test_eq(self): + # distinct instances, same data + self.assertEqual(Stat(), Stat()) + # different classes + class Foo(): pass + self.assertNotEqual(Stat(), Foo()) + self.assertNotEqual(Stat(), 123) + self.assertNotEqual(Stat(), None) + + def test_schema(self): + self.assertEqual(Stat().schema, + schema.Schema.from_string(base.extractor.SCHEMA_PREAMBLE + ''' + bse:filesize rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:integer ; + owl:maxCardinality "INF"^^xsd:number . 
+ ''')) + def test_extract(self): + ext = Stat() node = Node(ns.bsfs.Entity, '') # Blank node content = os.stat(__file__) - ext = Stat() + p_filesize = ext.schema.predicate(ns.bse.filesize) + entity = ext.schema.node(ns.bsfs.Node).get_child(ns.bsfs.Entity) + string = ext.schema.literal(ns.bsfs.Literal).get_child(ns.xsd.string) # baseline - self.assertSetEqual(set(ext.extract(node, content, (ns.bse.filesize, ))), - {(node, ns.bse.filesize, content.st_size)}) + self.assertSetEqual(set(ext.extract(node, content, (p_filesize, ))), + {(node, p_filesize, content.st_size)}) # predicates parameter is respected - self.assertSetEqual(set(ext.extract(node, content, (ns.bse.filesize, ns.bse.foo))), - {(node, ns.bse.filesize, content.st_size)}) - self.assertSetEqual(set(ext.extract(node, content, (ns.bse.foo, ))), set()) + p_foo = ext.schema.predicate(ns.bsfs.Predicate).get_child(ns.bse.foo, domain=entity, range=string) # unsupported predicate + self.assertSetEqual(set(ext.extract(node, content, (p_filesize, p_foo))), + {(node, p_filesize, content.st_size)}) + self.assertSetEqual(set(ext.extract(node, content, (p_foo, ))), set()) + # predicates are validated + p_bar = p_foo.get_child(ns.bse.filesizse) # same URI but different hierarchy + self.assertSetEqual(set(ext.extract(node, content, (p_filesize, p_bar))), + {(node, p_filesize, content.st_size)}) + self.assertSetEqual(set(ext.extract(node, content, (p_bar, ))), set()) # content variations - self.assertSetEqual(set(ext.extract(node, None, (ns.bse.filesize, ))), set()) + self.assertSetEqual(set(ext.extract(node, os.stat_result([12345] * len(content)), (p_filesize, p_bar))), + {(node, p_filesize, 12345)}) + # errors are suppressed + self.assertSetEqual(set(ext.extract(node, None, (p_filesize, ))), set()) ## main ## diff --git a/test/tools/__init__.py b/test/tools/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/test/tools/__init__.py diff --git a/test/tools/test_builder.py 
b/test/tools/test_builder.py new file mode 100644 index 0000000..bef0e9d --- /dev/null +++ b/test/tools/test_builder.py @@ -0,0 +1,247 @@ +""" + +Part of the bsie test suite. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import logging +import unittest + +# bsie imports +from bsie import base +from bsie.base import errors +from bsie.utils.bsfs import URI + +# objects to test +from bsie.tools.builder import ExtractorBuilder +from bsie.tools.builder import PipelineBuilder +from bsie.tools.builder import ReaderBuilder +from bsie.tools.builder import _safe_load +from bsie.tools.builder import _unpack_name + + +## code ## + +class TestUtils(unittest.TestCase): + def test_safe_load(self): + # invalid module + self.assertRaises(errors.LoaderError, _safe_load, 'dBGHMSAYOoKeKMpywDoKZQycENFPvN', 'foobar') + self.assertRaises(errors.LoaderError, _safe_load, 'dBGHMSAYOoKeKMpywDoKZQycENFPvN.bar', 'foobar') + # partially valid module + self.assertRaises(errors.LoaderError, _safe_load, 'os.foo', 'foobar') + # invalid class + self.assertRaises(errors.LoaderError, _safe_load, 'os.path', 'foo') + # valid module and class + cls = _safe_load('collections.abc', 'Container') + import collections.abc + self.assertEqual(cls, collections.abc.Container) + + def test_unpack_name(self): + self.assertRaises(TypeError, _unpack_name, 123) + self.assertRaises(TypeError, _unpack_name, None) + self.assertRaises(ValueError, _unpack_name, '') + self.assertRaises(ValueError, _unpack_name, 'path') + self.assertRaises(ValueError, _unpack_name, '.Path') + self.assertEqual(_unpack_name('path.Path'), ('path', 'Path')) + self.assertEqual(_unpack_name('path.foo.bar.Path'), ('path.foo.bar', 'Path')) + + +class TestReaderBuilder(unittest.TestCase): + def test_build(self): + builder = ReaderBuilder({'bsie.reader.path.Path': {}}) + # build configured reader + cls = builder.build('bsie.reader.path.Path') + import bsie.reader.path + 
self.assertIsInstance(cls, bsie.reader.path.Path) + # build unconfigured reader + cls = builder.build('bsie.reader.stat.Stat') + import bsie.reader.stat + self.assertIsInstance(cls, bsie.reader.stat.Stat) + # re-build previous reader (test cache) + self.assertEqual(cls, builder.build('bsie.reader.stat.Stat')) + # test invalid + self.assertRaises(TypeError, builder.build, 123) + self.assertRaises(TypeError, builder.build, None) + self.assertRaises(ValueError, builder.build, '') + self.assertRaises(ValueError, builder.build, 'Path') + self.assertRaises(errors.BuilderError, builder.build, 'path.Path') + # invalid config + builder = ReaderBuilder({'bsie.reader.stat.Stat': dict(foo=123)}) + self.assertRaises(errors.BuilderError, builder.build, 'bsie.reader.stat.Stat') + builder = ReaderBuilder({'bsie.reader.stat.Stat': 123}) + self.assertRaises(TypeError, builder.build, 'bsie.reader.stat.Stat') + # no instructions + builder = ReaderBuilder({}) + cls = builder.build('bsie.reader.stat.Stat') + self.assertIsInstance(cls, bsie.reader.stat.Stat) + + + +class TestExtractorBuilder(unittest.TestCase): + def test_iter(self): + # no specifications + self.assertListEqual(list(ExtractorBuilder([])), []) + # some specifications + builder = ExtractorBuilder([ + {'bsie.extractor.generic.path.Path': {}}, + {'bsie.extractor.generic.stat.Stat': {}}, + {'bsie.extractor.generic.path.Path': {}}, + ]) + self.assertListEqual(list(builder), [0, 1, 2]) + + def test_build(self): + # simple and repeated extractors + builder = ExtractorBuilder([ + {'bsie.extractor.generic.path.Path': {}}, + {'bsie.extractor.generic.stat.Stat': {}}, + {'bsie.extractor.generic.path.Path': {}}, + ]) + ext = [builder.build(0), builder.build(1), builder.build(2)] + import bsie.extractor.generic.path + import bsie.extractor.generic.stat + self.assertListEqual(ext, [ + bsie.extractor.generic.path.Path(), + bsie.extractor.generic.stat.Stat(), + bsie.extractor.generic.path.Path(), + ]) + # out-of-bounds raises IndexError + 
self.assertRaises(IndexError, builder.build, 3) + + # building with args + builder = ExtractorBuilder([ + {'bsie.extractor.generic.constant.Constant': { + 'schema': ''' + bse:author rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:string ; + owl:maxCardinality "1"^^xsd:number . + bse:rating rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:integer ; + owl:maxCardinality "1"^^xsd:number . + ''', + 'tuples': [ + ('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I'), + ('http://bsfs.ai/schema/Entity#rating', 123), + ], + }}]) + obj = builder.build(0) + import bsie.extractor.generic.constant + self.assertEqual(obj, bsie.extractor.generic.constant.Constant(''' + bse:author rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:string ; + owl:maxCardinality "1"^^xsd:number . + bse:rating rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:integer ; + owl:maxCardinality "1"^^xsd:number . 
+ ''', [ + ('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I'), + ('http://bsfs.ai/schema/Entity#rating', 123), + ])) + + # building with invalid args + self.assertRaises(errors.BuilderError, ExtractorBuilder( + [{'bsie.extractor.generic.path.Path': {'foo': 123}}]).build, 0) + # non-dict build specification + self.assertRaises(TypeError, ExtractorBuilder( + [('bsie.extractor.generic.path.Path', {})]).build, 0) + # multiple keys per build specification + self.assertRaises(TypeError, ExtractorBuilder( + [{'bsie.extractor.generic.path.Path': {}, + 'bsie.extractor.generic.stat.Stat': {}}]).build, 0) + # non-dict value for kwargs + self.assertRaises(TypeError, ExtractorBuilder( + [{'bsie.extractor.generic.path.Path': 123}]).build, 0) + + + + +class TestPipelineBuilder(unittest.TestCase): + def test_build(self): + prefix = URI('http://example.com/local/file#') + c_schema = ''' + bse:author rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:string ; + owl:maxCardinality "1"^^xsd:number . 
+ ''' + c_tuples = [('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')] + # prepare builders + rbuild = ReaderBuilder({}) + ebuild = ExtractorBuilder([ + {'bsie.extractor.generic.path.Path': {}}, + {'bsie.extractor.generic.stat.Stat': {}}, + {'bsie.extractor.generic.constant.Constant': dict( + schema=c_schema, + tuples=c_tuples, + )}, + ]) + # build pipeline + builder = PipelineBuilder(prefix, rbuild, ebuild) + pipeline = builder.build() + # delayed import + import bsie.reader.path + import bsie.reader.stat + import bsie.extractor.generic.path + import bsie.extractor.generic.stat + import bsie.extractor.generic.constant + # check pipeline + self.assertDictEqual(pipeline._ext2rdr, { + bsie.extractor.generic.path.Path(): bsie.reader.path.Path(), + bsie.extractor.generic.stat.Stat(): bsie.reader.stat.Stat(), + bsie.extractor.generic.constant.Constant(c_schema, c_tuples): None, + }) + + # fail to load extractor + ebuild_err = ExtractorBuilder([ + {'bsie.extractor.generic.foo.Foo': {}}, + {'bsie.extractor.generic.path.Path': {}}, + ]) + with self.assertLogs(logging.getLogger('bsie.tools.builder'), logging.ERROR): + pipeline = PipelineBuilder(prefix, rbuild, ebuild_err).build() + self.assertDictEqual(pipeline._ext2rdr, { + bsie.extractor.generic.path.Path(): bsie.reader.path.Path()}) + + # fail to build extractor + ebuild_err = ExtractorBuilder([ + {'bsie.extractor.generic.path.Path': {'foo': 123}}, + {'bsie.extractor.generic.path.Path': {}}, + ]) + with self.assertLogs(logging.getLogger('bsie.tools.builder'), logging.ERROR): + pipeline = PipelineBuilder(prefix, rbuild, ebuild_err).build() + self.assertDictEqual(pipeline._ext2rdr, { + bsie.extractor.generic.path.Path(): bsie.reader.path.Path()}) + + # fail to load reader + with self.assertLogs(logging.getLogger('bsie.tools.builder'), logging.ERROR): + # switch reader of an extractor + old_reader = bsie.extractor.generic.path.Path.CONTENT_READER + bsie.extractor.generic.path.Path.CONTENT_READER = 
'bsie.reader.foo.Foo' + # build pipeline with invalid reader reference + pipeline = PipelineBuilder(prefix, rbuild, ebuild).build() + self.assertDictEqual(pipeline._ext2rdr, { + bsie.extractor.generic.stat.Stat(): bsie.reader.stat.Stat(), + bsie.extractor.generic.constant.Constant(c_schema, c_tuples): None, + }) + # switch back + bsie.extractor.generic.path.Path.CONTENT_READER = old_reader + + # fail to build reader + rbuild_err = ReaderBuilder({'bsie.reader.stat.Stat': dict(foo=123)}) + with self.assertLogs(logging.getLogger('bsie.tools.builder'), logging.ERROR): + pipeline = PipelineBuilder(prefix, rbuild_err, ebuild).build() + self.assertDictEqual(pipeline._ext2rdr, { + bsie.extractor.generic.path.Path(): bsie.reader.path.Path(), + bsie.extractor.generic.constant.Constant(c_schema, c_tuples): None, + }) + + +## main ## + +if __name__ == '__main__': + unittest.main() + +## EOF ## diff --git a/test/tools/test_pipeline.py b/test/tools/test_pipeline.py new file mode 100644 index 0000000..9888d2e --- /dev/null +++ b/test/tools/test_pipeline.py @@ -0,0 +1,167 @@ +""" + +Part of the bsie test suite. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import logging +import os +import unittest + +# bsie imports +from bsie.base import errors +from bsie.utils import ns +from bsie.utils.bsfs import URI +from bsie.utils.node import Node +import bsie.extractor.generic.constant +import bsie.extractor.generic.path +import bsie.extractor.generic.stat +import bsie.reader.path +import bsie.reader.stat + +# objects to test +from bsie.tools.pipeline import Pipeline + + +## code ## + +class TestPipeline(unittest.TestCase): + def setUp(self): + # constant A + csA = ''' + bse:author rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:string ; + owl:maxCardinality "1"^^xsd:number . 
+ ''' + tupA = [('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')] + # constant B + csB = ''' + bse:rating rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:integer ; + owl:maxCardinality "1"^^xsd:number . + ''' + tupB = [('http://bsfs.ai/schema/Entity#rating', 123)] + # extractors/readers + self.ext2rdr = { + bsie.extractor.generic.path.Path(): bsie.reader.path.Path(), + bsie.extractor.generic.stat.Stat(): bsie.reader.stat.Stat(), + bsie.extractor.generic.constant.Constant(csA, tupA): None, + bsie.extractor.generic.constant.Constant(csB, tupB): None, + } + self.prefix = URI('http://example.com/local/file#') + + def test_essentials(self): + pipeline = Pipeline(self.prefix, self.ext2rdr) + self.assertEqual(str(pipeline), 'Pipeline') + self.assertEqual(repr(pipeline), 'Pipeline(...)') + + def test_equality(self): + pipeline = Pipeline(self.prefix, self.ext2rdr) + # a pipeline is equivalent to itself + self.assertEqual(pipeline, pipeline) + self.assertEqual(hash(pipeline), hash(pipeline)) + # identical builds are equivalent + self.assertEqual(pipeline, Pipeline(self.prefix, self.ext2rdr)) + self.assertEqual(hash(pipeline), hash(Pipeline(self.prefix, self.ext2rdr))) + + # equivalence respects prefix + self.assertNotEqual(pipeline, Pipeline(URI('http://example.com/global/ent#'), self.ext2rdr)) + self.assertNotEqual(hash(pipeline), hash(Pipeline(URI('http://example.com/global/ent#'), self.ext2rdr))) + # equivalence respects extractors/readers + ext2rdr = {ext: rdr for idx, (ext, rdr) in enumerate(self.ext2rdr.items()) if idx % 2 == 0} + self.assertNotEqual(pipeline, Pipeline(self.prefix, ext2rdr)) + self.assertNotEqual(hash(pipeline), hash(Pipeline(self.prefix, ext2rdr))) + + # equivalence respects schema + p2 = Pipeline(self.prefix, self.ext2rdr) + p2.schema = pipeline.schema.Empty() + self.assertNotEqual(pipeline, p2) + self.assertNotEqual(hash(pipeline), hash(p2)) + + # not equal to other types + class Foo(): pass + 
self.assertNotEqual(pipeline, Foo()) + self.assertNotEqual(hash(pipeline), hash(Foo())) + self.assertNotEqual(pipeline, 123) + self.assertNotEqual(hash(pipeline), hash(123)) + self.assertNotEqual(pipeline, None) + self.assertNotEqual(hash(pipeline), hash(None)) + + + def test_call(self): + # build pipeline + pipeline = Pipeline(self.prefix, self.ext2rdr) + # build objects for tests + content_hash = 'e3bb4ab54e4a50d75626a1f76814f152f4edc60a82ad724aa2aa922ca5534427' + subject = Node(ns.bsfs.Entity, self.prefix + content_hash) + testfile = os.path.join(os.path.dirname(__file__), 'testfile.t') + p_filename = pipeline.schema.predicate(ns.bse.filename) + p_filesize = pipeline.schema.predicate(ns.bse.filesize) + p_author = pipeline.schema.predicate(ns.bse.author) + p_rating = pipeline.schema.predicate(ns.bse.rating) + entity = pipeline.schema.node(ns.bsfs.Entity) + p_invalid = pipeline.schema.predicate(ns.bsfs.Predicate).get_child(ns.bse.foo, range=entity) + + # extract given predicates + self.assertSetEqual(set(pipeline(testfile, {p_filename, p_filesize})), { + (subject, p_filename, 'testfile.t'), + (subject, p_filesize, 11), + }) + self.assertSetEqual(set(pipeline(testfile, {p_author})), { + (subject, p_author, 'Me, myself, and I'), + }) + self.assertSetEqual(set(pipeline(testfile, {p_filename})), { + (subject, p_filename, 'testfile.t'), + }) + self.assertSetEqual(set(pipeline(testfile, {p_filesize})), { + (subject, p_filesize, 11), + }) + # extract all predicates + self.assertSetEqual(set(pipeline(testfile)), { + (subject, p_filename, 'testfile.t'), + (subject, p_filesize, 11), + (subject, p_author, 'Me, myself, and I'), + (subject, p_rating, 123), + }) + # invalid predicate + self.assertSetEqual(set(pipeline(testfile, {p_invalid})), set()) + # valid/invalid predicates mixed + self.assertSetEqual(set(pipeline(testfile, {p_filename, p_invalid})), { + (subject, p_filename, 'testfile.t'), + }) + # invalid path + self.assertRaises(FileNotFoundError, list, 
pipeline('inexistent_file')) + # FIXME: unreadable file (e.g. permissions error) + + def test_call_reader_err(self): + class FaultyReader(bsie.reader.path.Path): + def __call__(self, path): + raise errors.ReaderError('reader error') + + pipeline = Pipeline(self.prefix, {bsie.extractor.generic.path.Path(): FaultyReader()}) + with self.assertLogs(logging.getLogger('bsie.tools.pipeline'), logging.ERROR): + testfile = os.path.join(os.path.dirname(__file__), 'testfile.t') + p_filename = pipeline.schema.predicate(ns.bse.filename) + self.assertSetEqual(set(pipeline(testfile, {p_filename})), set()) + + def test_call_extractor_err(self): + class FaultyExtractor(bsie.extractor.generic.path.Path): + def extract(self, subject, content, predicates): + raise errors.ExtractorError('extractor error') + + pipeline = Pipeline(self.prefix, {FaultyExtractor(): bsie.reader.path.Path()}) + with self.assertLogs(logging.getLogger('bsie.tools.pipeline'), logging.ERROR): + testfile = os.path.join(os.path.dirname(__file__), 'testfile.t') + p_filename = pipeline.schema.predicate(ns.bse.filename) + self.assertSetEqual(set(pipeline(testfile, {p_filename})), set()) + + +## main ## + +if __name__ == '__main__': + unittest.main() + +## EOF ## diff --git a/test/tools/testfile.t b/test/tools/testfile.t new file mode 100644 index 0000000..58bf1b8 --- /dev/null +++ b/test/tools/testfile.t @@ -0,0 +1 @@ +hello worl diff --git a/test/utils/__init__.py b/test/utils/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/test/utils/__init__.py diff --git a/test/utils/test_node.py b/test/utils/test_node.py new file mode 100644 index 0000000..826f199 --- /dev/null +++ b/test/utils/test_node.py @@ -0,0 +1,66 @@ +""" + +Part of the bsie test suite. +A copy of the license is provided with the project. 
+Author: Matthias Baumgartner, 2022 +""" +# imports +import unittest + +# bsie imports +from bsie.utils.bsfs import URI +from bsie.utils import ns + +# objects to test +from bsie.utils.node import Node + + +## code ## + +class TestNode(unittest.TestCase): + def test_equality(self): + uri = URI('http://example.com/me/entity#1234') + node = Node(ns.bsfs.Entity, uri) + # basic equivalence + self.assertEqual(node, Node(ns.bsfs.Entity, URI('http://example.com/me/entity#1234'))) + self.assertEqual(hash(node), hash(Node(ns.bsfs.Entity, URI('http://example.com/me/entity#1234')))) + # equality respects uri + self.assertNotEqual(node, Node(ns.bsfs.Entity, URI('http://example.com/me/entity#4321'))) + self.assertNotEqual(hash(node), hash(Node(ns.bsfs.Entity, URI('http://example.com/me/entity#4321')))) + # equality respects node_type + self.assertNotEqual(node, Node(ns.bsfs.Foo, uri)) + self.assertNotEqual(hash(node), hash(Node(ns.bsfs.Foo, uri))) + # not equal to other types + self.assertNotEqual(node, 1234) + self.assertNotEqual(hash(node), hash(1234)) + self.assertNotEqual(node, uri) + self.assertNotEqual(hash(node), hash(uri)) + self.assertNotEqual(node, ns.bsfs.Entity) + self.assertNotEqual(hash(node), hash(ns.bsfs.Entity)) + class Foo(): pass + self.assertNotEqual(node, Foo()) + self.assertNotEqual(hash(node), hash(Foo())) + + def test_str(self): + uri = URI('http://example.com/me/entity#1234') + # basic string conversion + node = Node(ns.bsfs.Entity, uri) + self.assertEqual(str(node), 'Node(http://bsfs.ai/schema/Entity, http://example.com/me/entity#1234)') + self.assertEqual(repr(node), 'Node(http://bsfs.ai/schema/Entity, http://example.com/me/entity#1234)') + # string conversion respects node_type + node = Node(ns.bsfs.Foo, uri) + self.assertEqual(str(node), 'Node(http://bsfs.ai/schema/Foo, http://example.com/me/entity#1234)') + self.assertEqual(repr(node), 'Node(http://bsfs.ai/schema/Foo, http://example.com/me/entity#1234)') + # string conversion respects uri + node = 
Node(ns.bsfs.Entity, URI('http://example.com/me/entity#4321')) + self.assertEqual(str(node), 'Node(http://bsfs.ai/schema/Entity, http://example.com/me/entity#4321)') + self.assertEqual(repr(node), 'Node(http://bsfs.ai/schema/Entity, http://example.com/me/entity#4321)') + + + +## main ## + +if __name__ == '__main__': + unittest.main() + +## EOF ## |