3 files changed, 79 insertions, 21 deletions
diff --git a/bsie/base/errors.py b/bsie/base/errors.py
index f86ffb2..760351f 100644
--- a/bsie/base/errors.py
+++ b/bsie/base/errors.py
@@ -8,15 +8,29 @@ Author: Matthias Baumgartner, 2022
 import typing
 
 # exports
-__all__: typing.Sequence[str] = []
+__all__: typing.Sequence[str] = (
+    'BuilderError',
+    'ExtractorError',
+    'LoaderError',
+    'ReaderError',
+    )
 
 
 ## code ##
 
-class _BSIE_Error(Exception):
+class _BSIEError(Exception):
     """Generic BSIE error."""
 
-class ReaderError(_BSIE_Error):
+class BuilderError(_BSIEError):
+    """The Builder failed to create an instance."""
+
+class LoaderError(BuilderError):
+    """Failed to load a module or class."""
+
+class ExtractorError(_BSIEError):
+    """The Extractor failed to process the given content."""
+
+class ReaderError(_BSIEError):
     """The Reader failed to read the given file."""
 
 ## EOF ##
diff --git a/bsie/base/extractor.py b/bsie/base/extractor.py
index ea43925..2fc4f18 100644
--- a/bsie/base/extractor.py
+++ b/bsie/base/extractor.py
@@ -8,16 +8,40 @@ Author: Matthias Baumgartner, 2022
 import abc
 import typing
 
-# inner-module imports
-from . import reader
+# bsie imports
 from bsie.utils import node
-from bsie.utils.bsfs import URI, typename
+from bsie.utils.bsfs import schema as _schema, typename
 
 # exports
 __all__: typing.Sequence[str] = (
     'Extractor',
     )
 
+# constants
+
+# essential definitions typically used in extractor schemas.
+# NOTE: The definition here is only for convenience; each Extractor must implement its use, if so desired.
+SCHEMA_PREAMBLE = '''
+    # common external prefixes
+    prefix owl: <http://www.w3.org/2002/07/owl#>
+    prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+    prefix xsd: <http://www.w3.org/2001/XMLSchema#>
+    prefix schema: <http://schema.org/>
+
+    # common bsfs prefixes
+    prefix bsfs: <http://bsfs.ai/schema/>
+    prefix bse: <http://bsfs.ai/schema/Entity#>
+
+    # essential nodes
+    bsfs:Entity rdfs:subClassOf bsfs:Node .
+
+    # common definitions
+    xsd:string rdfs:subClassOf bsfs:Literal .
+    xsd:integer rdfs:subClassOf bsfs:Literal .
+
+    '''
+
 
 ## code ##
 
@@ -25,7 +49,13 @@ class Extractor(abc.ABC):
     """Produce (node, predicate, value)-triples from some content."""
 
     # what type of content is expected (i.e. reader subclass).
-    CONTENT_READER: typing.Optional[typing.Type[reader.Reader]] = None
+    CONTENT_READER: typing.Optional[str] = None
+
+    # extractor schema.
+    schema: _schema.Schema
+
+    def __init__(self, schema: _schema.Schema):
+        self.schema = schema
 
     def __str__(self) -> str:
         return typename(self)
@@ -33,17 +63,32 @@ class Extractor(abc.ABC):
     def __repr__(self) -> str:
         return f'{typename(self)}()'
 
-    @abc.abstractmethod
-    def schema(self) -> str:
-        """Return the schema (predicates and nodes) produced by this Extractor."""
+    def __eq__(self, other: typing.Any) -> bool:
+        return isinstance(other, type(self)) \
+          and self.CONTENT_READER == other.CONTENT_READER \
+          and self.schema == other.schema
+
+    def __hash__(self) -> int:
+        return hash((type(self), self.CONTENT_READER, self.schema))
+
+    def predicates(self) -> typing.Iterator[_schema.Predicate]:
+        """Return the predicates that may be part of extracted triples."""
+        # NOTE: Some predicates in the schema might not occur in actual triples,
+        # but are defined due to predicate class hierarchy. E.g., bsfs:Predicate
+        # is part of every schema but should not be used in triples.
+        # Announcing all predicates might not be the most efficient way; however,
+        # it is the safest one. Concrete extractors that produce additional
+        # predicates (e.g. auxiliary nodes with their own predicates) should
+        # override this method to only include the principal predicates.
+        return self.schema.predicates()
 
     @abc.abstractmethod
     def extract(
             self,
             subject: node.Node,
             content: typing.Any,
-            predicates: typing.Iterable[URI],
-            ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]:
+            predicates: typing.Iterable[_schema.Predicate],
+            ) -> typing.Iterator[typing.Tuple[node.Node, _schema.Predicate, typing.Any]]:
         """Return (node, predicate, value) triples."""
 
 ## EOF ##
diff --git a/bsie/base/reader.py b/bsie/base/reader.py
index f29e451..b7eabf7 100644
--- a/bsie/base/reader.py
+++ b/bsie/base/reader.py
@@ -12,12 +12,11 @@ Author: Matthias Baumgartner, 2022
 import abc
 import typing
 
-# inner-module imports
+# bsie imports
 from bsie.utils.bsfs import URI, typename
 
 # exports
 __all__: typing.Sequence[str] = (
-    'Aggregator',
     'Reader',
     )
 
@@ -27,20 +26,20 @@ __all__: typing.Sequence[str] = (
 class Reader(abc.ABC):
     """Read and return some content from a file."""
 
-    # In what data structure content is returned
-    CONTENT_TYPE = typing.Union[typing.Any]
-    # NOTE: Child classes must also assign a typing.Union even if there's
-    # only one options
-
     def __str__(self) -> str:
         return typename(self)
 
     def __repr__(self) -> str:
         return f'{typename(self)}()'
 
-    # FIXME: How about using contexts instead of calls?
+    def __eq__(self, other: typing.Any) -> bool:
+        return isinstance(other, type(self))
+
+    def __hash__(self) -> int:
+        return hash(type(self))
+
     @abc.abstractmethod
-    def __call__(self, path: URI) -> CONTENT_TYPE:
+    def __call__(self, path: URI) -> typing.Any:
         """Return some content of the file at *path*.
         Raises a `ReaderError` if the reader cannot make sense of the file format.
         """