diff options
author | Matthias Baumgartner <dev@igsor.net> | 2022-11-25 14:59:17 +0100 |
---|---|---|
committer | Matthias Baumgartner <dev@igsor.net> | 2022-11-25 14:59:17 +0100 |
commit | a294bbe0622911bcd6df37c38865a4c0eb290593 (patch) | |
tree | f038ed8d4f04c63991939e13e61ae170de4e2c57 /bsie/base | |
parent | 9389c741bdbbca9adbff6099d440706cd63deac4 (diff) | |
parent | 3e6a69ce7f109f0fd4352507ad60d58d4cbd24a7 (diff) | |
download | bsie-a294bbe0622911bcd6df37c38865a4c0eb290593.tar.gz bsie-a294bbe0622911bcd6df37c38865a4c0eb290593.tar.bz2 bsie-a294bbe0622911bcd6df37c38865a4c0eb290593.zip |
Merge branch 'mb/tools' into develop
Diffstat (limited to 'bsie/base')
-rw-r--r-- | bsie/base/errors.py | 20 | ||||
-rw-r--r-- | bsie/base/extractor.py | 63 | ||||
-rw-r--r-- | bsie/base/reader.py | 17 |
3 files changed, 79 insertions, 21 deletions
diff --git a/bsie/base/errors.py b/bsie/base/errors.py index f86ffb2..760351f 100644 --- a/bsie/base/errors.py +++ b/bsie/base/errors.py @@ -8,15 +8,29 @@ Author: Matthias Baumgartner, 2022 import typing # exports -__all__: typing.Sequence[str] = [] +__all__: typing.Sequence[str] = ( + 'BuilderError', + 'ExtractorError', + 'LoaderError', + 'ReaderError', + ) ## code ## -class _BSIE_Error(Exception): +class _BSIEError(Exception): """Generic BSIE error.""" -class ReaderError(_BSIE_Error): +class BuilderError(_BSIEError): + """The Builder failed to create an instance.""" + +class LoaderError(BuilderError): + """Failed to load a module or class.""" + +class ExtractorError(_BSIEError): + """The Extractor failed to process the given content.""" + +class ReaderError(_BSIEError): """The Reader failed to read the given file.""" ## EOF ## diff --git a/bsie/base/extractor.py b/bsie/base/extractor.py index ea43925..2fc4f18 100644 --- a/bsie/base/extractor.py +++ b/bsie/base/extractor.py @@ -8,16 +8,40 @@ Author: Matthias Baumgartner, 2022 import abc import typing -# inner-module imports -from . import reader +# bsie imports from bsie.utils import node -from bsie.utils.bsfs import URI, typename +from bsie.utils.bsfs import schema as _schema, typename # exports __all__: typing.Sequence[str] = ( 'Extractor', ) +# constants + +# essential definitions typically used in extractor schemas. +# NOTE: The definition here is only for convenience; Each Extractor must implement its use, if so desired. +SCHEMA_PREAMBLE = ''' + # common external prefixes + prefix owl: <http://www.w3.org/2002/07/owl#> + prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> + prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> + prefix xsd: <http://www.w3.org/2001/XMLSchema#> + prefix schema: <http://schema.org/> + + # common bsfs prefixes + prefix bsfs: <http://bsfs.ai/schema/> + prefix bse: <http://bsfs.ai/schema/Entity#> + + # essential nodes + bsfs:Entity rdfs:subClassOf bsfs:Node . + + # common definitions + xsd:string rdfs:subClassOf bsfs:Literal . + xsd:integer rdfs:subClassOf bsfs:Literal . + + ''' + ## code ## @@ -25,7 +49,13 @@ class Extractor(abc.ABC): """Produce (node, predicate, value)-triples from some content.""" # what type of content is expected (i.e. reader subclass). - CONTENT_READER: typing.Optional[typing.Type[reader.Reader]] = None + CONTENT_READER: typing.Optional[str] = None + + # extractor schema. + schema: _schema.Schema + + def __init__(self, schema: _schema.Schema): + self.schema = schema def __str__(self) -> str: return typename(self) @@ -33,17 +63,32 @@ class Extractor(abc.ABC): def __repr__(self) -> str: return f'{typename(self)}()' - @abc.abstractmethod - def schema(self) -> str: - """Return the schema (predicates and nodes) produced by this Extractor.""" + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) \ + and self.CONTENT_READER == other.CONTENT_READER \ + and self.schema == other.schema + + def __hash__(self) -> int: + return hash((type(self), self.CONTENT_READER, self.schema)) + + def predicates(self) -> typing.Iterator[_schema.Predicate]: + """Return the predicates that may be part of extracted triples.""" + # NOTE: Some predicates in the schema might not occur in actual triples, + # but are defined due to predicate class hierarchy. E.g., bsfs:Predicate + # is part of every schema but should not be used in triples. + # Announcing all predicates might not be the most efficient way, however, + # it is the most safe one. Concrete extractors that produce additional + # predicates (e.g. auxiliary nodes with their own predicates) should + # overwrite this method to only include the principal predicates. + return self.schema.predicates() @abc.abstractmethod def extract( self, subject: node.Node, content: typing.Any, - predicates: typing.Iterable[URI], - ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]: + predicates: typing.Iterable[_schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, _schema.Predicate, typing.Any]]: """Return (node, predicate, value) triples.""" ## EOF ## diff --git a/bsie/base/reader.py b/bsie/base/reader.py index f29e451..b7eabf7 100644 --- a/bsie/base/reader.py +++ b/bsie/base/reader.py @@ -12,12 +12,11 @@ Author: Matthias Baumgartner, 2022 import abc import typing -# inner-module imports +# bsie imports from bsie.utils.bsfs import URI, typename # exports __all__: typing.Sequence[str] = ( - 'Aggregator', 'Reader', ) @@ -27,20 +26,20 @@ __all__: typing.Sequence[str] = ( class Reader(abc.ABC): """Read and return some content from a file.""" - # In what data structure content is returned - CONTENT_TYPE = typing.Union[typing.Any] - # NOTE: Child classes must also assign a typing.Union even if there's - # only one options - def __str__(self) -> str: return typename(self) def __repr__(self) -> str: return f'{typename(self)}()' - # FIXME: How about using contexts instead of calls? + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) + + def __hash__(self) -> int: + return hash(type(self)) + @abc.abstractmethod - def __call__(self, path: URI) -> CONTENT_TYPE: + def __call__(self, path: URI) -> typing.Any: """Return some content of the file at *path*. Raises a `ReaderError` if the reader cannot make sense of the file format. """ |