aboutsummaryrefslogtreecommitdiffstats
path: root/bsie/base
diff options
context:
space:
mode:
Diffstat (limited to 'bsie/base')
-rw-r--r--bsie/base/errors.py20
-rw-r--r--bsie/base/extractor.py63
-rw-r--r--bsie/base/reader.py17
3 files changed, 79 insertions, 21 deletions
diff --git a/bsie/base/errors.py b/bsie/base/errors.py
index f86ffb2..760351f 100644
--- a/bsie/base/errors.py
+++ b/bsie/base/errors.py
@@ -8,15 +8,29 @@ Author: Matthias Baumgartner, 2022
import typing
# exports
-__all__: typing.Sequence[str] = []
+__all__: typing.Sequence[str] = (
+ 'BuilderError',
+ 'ExtractorError',
+ 'LoaderError',
+ 'ReaderError',
+ )
## code ##
-class _BSIE_Error(Exception):
+class _BSIEError(Exception):
"""Generic BSIE error."""
-class ReaderError(_BSIE_Error):
+class BuilderError(_BSIEError):
+ """The Builder failed to create an instance."""
+
+class LoaderError(BuilderError):
+ """Failed to load a module or class."""
+
+class ExtractorError(_BSIEError):
+ """The Extractor failed to process the given content."""
+
+class ReaderError(_BSIEError):
"""The Reader failed to read the given file."""
## EOF ##
diff --git a/bsie/base/extractor.py b/bsie/base/extractor.py
index ea43925..2fc4f18 100644
--- a/bsie/base/extractor.py
+++ b/bsie/base/extractor.py
@@ -8,16 +8,40 @@ Author: Matthias Baumgartner, 2022
import abc
import typing
-# inner-module imports
-from . import reader
+# bsie imports
from bsie.utils import node
-from bsie.utils.bsfs import URI, typename
+from bsie.utils.bsfs import schema as _schema, typename
# exports
__all__: typing.Sequence[str] = (
'Extractor',
)
+# constants
+
+# essential definitions typically used in extractor schemas.
+# NOTE: The definition here is only for convenience; each Extractor must implement its use, if so desired.
+SCHEMA_PREAMBLE = '''
+ # common external prefixes
+ prefix owl: <http://www.w3.org/2002/07/owl#>
+ prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+ prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+ prefix xsd: <http://www.w3.org/2001/XMLSchema#>
+ prefix schema: <http://schema.org/>
+
+ # common bsfs prefixes
+ prefix bsfs: <http://bsfs.ai/schema/>
+ prefix bse: <http://bsfs.ai/schema/Entity#>
+
+ # essential nodes
+ bsfs:Entity rdfs:subClassOf bsfs:Node .
+
+ # common definitions
+ xsd:string rdfs:subClassOf bsfs:Literal .
+ xsd:integer rdfs:subClassOf bsfs:Literal .
+
+ '''
+
## code ##
@@ -25,7 +49,13 @@ class Extractor(abc.ABC):
"""Produce (node, predicate, value)-triples from some content."""
# what type of content is expected (i.e. reader subclass).
- CONTENT_READER: typing.Optional[typing.Type[reader.Reader]] = None
+ CONTENT_READER: typing.Optional[str] = None
+
+ # extractor schema.
+ schema: _schema.Schema
+
+ def __init__(self, schema: _schema.Schema):
+ self.schema = schema
def __str__(self) -> str:
return typename(self)
@@ -33,17 +63,32 @@ class Extractor(abc.ABC):
def __repr__(self) -> str:
return f'{typename(self)}()'
- @abc.abstractmethod
- def schema(self) -> str:
- """Return the schema (predicates and nodes) produced by this Extractor."""
+ def __eq__(self, other: typing.Any) -> bool:
+ return isinstance(other, type(self)) \
+ and self.CONTENT_READER == other.CONTENT_READER \
+ and self.schema == other.schema
+
+ def __hash__(self) -> int:
+ return hash((type(self), self.CONTENT_READER, self.schema))
+
+ def predicates(self) -> typing.Iterator[_schema.Predicate]:
+ """Return the predicates that may be part of extracted triples."""
+ # NOTE: Some predicates in the schema might not occur in actual triples,
+ # but are defined due to predicate class hierarchy. E.g., bsfs:Predicate
+ # is part of every schema but should not be used in triples.
+ # Announcing all predicates might not be the most efficient way; however,
+ # it is the safest one. Concrete extractors that produce additional
+ # predicates (e.g. auxiliary nodes with their own predicates) should
+ # override this method to only include the principal predicates.
+ return self.schema.predicates()
@abc.abstractmethod
def extract(
self,
subject: node.Node,
content: typing.Any,
- predicates: typing.Iterable[URI],
- ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]:
+ predicates: typing.Iterable[_schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, _schema.Predicate, typing.Any]]:
"""Return (node, predicate, value) triples."""
## EOF ##
diff --git a/bsie/base/reader.py b/bsie/base/reader.py
index f29e451..b7eabf7 100644
--- a/bsie/base/reader.py
+++ b/bsie/base/reader.py
@@ -12,12 +12,11 @@ Author: Matthias Baumgartner, 2022
import abc
import typing
-# inner-module imports
+# bsie imports
from bsie.utils.bsfs import URI, typename
# exports
__all__: typing.Sequence[str] = (
- 'Aggregator',
'Reader',
)
@@ -27,20 +26,20 @@ __all__: typing.Sequence[str] = (
class Reader(abc.ABC):
"""Read and return some content from a file."""
- # In what data structure content is returned
- CONTENT_TYPE = typing.Union[typing.Any]
- # NOTE: Child classes must also assign a typing.Union even if there's
- # only one options
-
def __str__(self) -> str:
return typename(self)
def __repr__(self) -> str:
return f'{typename(self)}()'
- # FIXME: How about using contexts instead of calls?
+ def __eq__(self, other: typing.Any) -> bool:
+ return isinstance(other, type(self))
+
+ def __hash__(self) -> int:
+ return hash(type(self))
+
@abc.abstractmethod
- def __call__(self, path: URI) -> CONTENT_TYPE:
+ def __call__(self, path: URI) -> typing.Any:
"""Return some content of the file at *path*.
Raises a `ReaderError` if the reader cannot make sense of the file format.
"""