1 files changed, 113 insertions, 0 deletions
diff --git a/bsie/extractor/base.py b/bsie/extractor/base.py
new file mode 100644
index 0000000..7401244
--- /dev/null
+++ b/bsie/extractor/base.py
@@ -0,0 +1,113 @@
+"""The Extractor classes transform content into triples.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# standard imports
+import abc
+import typing
+
+# bsie imports
+from bsie.utils import bsfs, node, ns
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Extractor',
+    )
+
+# constants (module-level; note that SCHEMA_PREAMBLE is not listed in __all__).
+
+# Essential prefixes and class definitions that are typically used in extractor schemas.
+# NOTE: This preamble is for convenience only; each Extractor must implement its use, if so desired.
+SCHEMA_PREAMBLE = '''
+    # common external prefixes
+    prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+    prefix xsd: <http://www.w3.org/2001/XMLSchema#>
+    prefix schema: <http://schema.org/>
+
+    # common bsfs prefixes
+    prefix bsfs: <http://bsfs.ai/schema/>
+    prefix bse: <http://bsfs.ai/schema/Entity#>
+
+    # default definitions
+    bsfs:Array rdfs:subClassOf bsfs:Literal .
+    bsfs:Number rdfs:subClassOf bsfs:Literal .
+    bsfs:Time rdfs:subClassOf bsfs:Literal .
+    bsfs:Feature rdfs:subClassOf bsfs:Array ;
+        bsfs:dimension "1"^^xsd:integer ;
+        bsfs:dtype bsfs:f16 ;
+        bsfs:distance bsfs:euclidean .
+
+    # essential nodes
+    bsfs:Entity rdfs:subClassOf bsfs:Node .
+    bsfs:File rdfs:subClassOf bsfs:Entity .
+
+    # common definitions
+    xsd:string rdfs:subClassOf bsfs:Literal .
+    xsd:integer rdfs:subClassOf bsfs:Number .
+
+    '''
+
+
+## code ##
+
+class Extractor(abc.ABC):
+    """Abstract interface for producing (subject, predicate, value)-triples from content.
+
+    An Extractor produces principal predicates that describe the content itself
+    (i.e., triples that include the subject). If an extracted value is a node
+    itself, it may also generate triples with auxiliary predicates.
+    """
+
+    # what type of content is expected (i.e. reader subclass).
+    CONTENT_READER: typing.Optional[str] = None
+
+    # extractor schema.
+    _schema: bsfs.schema.Schema
+
+    def __init__(self, schema: bsfs.schema.Schema):
+        self._schema = schema
+
+    @property
+    def schema(self) -> bsfs.schema.Schema:
+        """Return the schema this extractor was created with."""
+        return self._schema
+
+    @property
+    def principals(self) -> typing.Iterator[bsfs.schema.Predicate]:
+        """Return the principal predicates, i.e., relations from/to the extraction subject."""
+        entity = self.schema.node(ns.bsfs.Entity)
+        def is_principal(pred):
+            if pred.domain <= entity:
+                return True
+            return pred.range is not None and pred.range <= entity
+        return (pred for pred in self.schema.predicates() if is_principal(pred))
+
+    def __str__(self) -> str:
+        return bsfs.typename(self)
+
+    def __repr__(self) -> str:
+        return '{}()'.format(bsfs.typename(self))
+
+    def __eq__(self, other: typing.Any) -> bool:
+        if not isinstance(other, type(self)):
+            return False
+        same_reader = self.CONTENT_READER == other.CONTENT_READER
+        return same_reader and self.schema == other.schema
+
+    def __hash__(self) -> int:
+        return hash((type(self), self.CONTENT_READER, self.schema))
+
+    @abc.abstractmethod
+    def extract(
+            self,
+            subject: node.Node,
+            content: typing.Any,
+            principals: typing.Iterable[bsfs.schema.Predicate],
+            ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+        """Return (node, predicate, value) triples."""
+        # FIXME: type annotation could be more strict: value is Hashable
+
+## EOF ##