aboutsummaryrefslogtreecommitdiffstats
path: root/bsie/extractor/base.py
diff options
context:
space:
mode:
authorMatthias Baumgartner <dev@igsor.net>2023-03-05 19:22:58 +0100
committerMatthias Baumgartner <dev@igsor.net>2023-03-05 19:22:58 +0100
commita35b33f4f1ddcf6f1bb8ab0f41b87bf2b847f11d (patch)
treefb220da28bb7248ebf37ce09af5de88f2c1aaad4 /bsie/extractor/base.py
parent7582c280ad5324a2f0427999911c7e7abc14a6ab (diff)
parentaf81318ae9311fd0b0e16949cef3cfaf7996970b (diff)
downloadbsie-main.tar.gz
bsie-main.tar.bz2
bsie-main.zip
Merge branch 'develop'HEADv0.23.03releasemain
Diffstat (limited to 'bsie/extractor/base.py')
-rw-r--r--bsie/extractor/base.py116
1 files changed, 116 insertions, 0 deletions
diff --git a/bsie/extractor/base.py b/bsie/extractor/base.py
new file mode 100644
index 0000000..f92d7cc
--- /dev/null
+++ b/bsie/extractor/base.py
@@ -0,0 +1,116 @@
+"""The Extractor classes transform content into triples.
+"""
+# standard imports
+import abc
+import typing
+
+# bsie imports
+from bsie.utils import bsfs, node, ns
+
+# exports: public names of this module (only the Extractor class is listed).
+__all__: typing.Sequence[str] = (
+ 'Extractor',
+ )
+
+# constants
+
+# Essential definitions typically used in extractor schemas: common prefixes plus base literal and node classes.
+# NOTE: This preamble is only a convenience; each Extractor has to make use of it explicitly, if so desired.
+SCHEMA_PREAMBLE = '''
+ # common external prefixes
+ prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+ prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+ prefix xsd: <http://www.w3.org/2001/XMLSchema#>
+ prefix schema: <http://schema.org/>
+
+ # common bsfs prefixes
+ prefix bsfs: <https://schema.bsfs.io/core/>
+ prefix bsl: <https://schema.bsfs.io/core/Literal/>
+ prefix bsa: <https://schema.bsfs.io/core/Literal/Array/>
+ prefix bsd: <https://schema.bsfs.io/core/distance#>
+
+ prefix bsie: <https://schema.bsfs.io/ie/>
+ prefix bsn: <https://schema.bsfs.io/ie/Node/>
+ prefix bse: <https://schema.bsfs.io/ie/Node/Entity#>
+ prefix bsp: <https://schema.bsfs.io/ie/Node/Preview#>
+
+ # default definitions
+ bsl:Array rdfs:subClassOf bsfs:Literal .
+ bsl:Number rdfs:subClassOf bsfs:Literal .
+ bsl:Time rdfs:subClassOf bsfs:Literal .
+ bsa:Feature rdfs:subClassOf bsl:Array ;
+ bsfs:dimension "1"^^xsd:integer ;
+ bsfs:dtype <https://schema.bsfs.io/core/dtype#f16> ;
+ bsfs:distance bsd:euclidean .
+
+ # essential nodes
+ bsn:Entity rdfs:subClassOf bsfs:Node .
+
+ # common definitions
+ xsd:string rdfs:subClassOf bsfs:Literal .
+ xsd:integer rdfs:subClassOf bsl:Number .
+ xsd:float rdfs:subClassOf bsl:Number .
+
+ '''
+
+
+## code ##
+
+class Extractor(abc.ABC):
+    """Abstract base class for producing (subject, predicate, value)-triples.
+
+    Principal predicates carry information about the content itself (i.e.,
+    their triples include the subject). If an extracted value is a node
+    itself, triples with auxiliary predicates may be generated for it, too.
+    """
+
+    # expected type of content, given as reader subclass (None by default).
+    CONTENT_READER: typing.Optional[str] = None
+
+    # schema of the extractor.
+    _schema: bsfs.schema.Schema
+
+    def __init__(self, schema: bsfs.schema.Schema):
+        self._schema = schema
+
+    def __str__(self) -> str:
+        return bsfs.typename(self)
+
+    def __repr__(self) -> str:
+        return '{}()'.format(bsfs.typename(self))
+
+    def __eq__(self, other: typing.Any) -> bool:
+        # compare in order: instance check, content reader, schema.
+        if not isinstance(other, type(self)):
+            return False
+        if self.CONTENT_READER != other.CONTENT_READER:
+            return False
+        return self.schema == other.schema
+
+    def __hash__(self) -> int:
+        key = (type(self), self.CONTENT_READER, self.schema)
+        return hash(key)
+
+    @property
+    def schema(self) -> bsfs.schema.Schema:
+        """Return the schema held by this extractor."""
+        return self._schema
+
+    @property
+    def principals(self) -> typing.Iterator[bsfs.schema.Predicate]:
+        """Return the principal predicates, i.e., relations from/to the extraction subject."""
+        entity = self.schema.node(ns.bsn.Entity)
+        def touches_entity(pred):
+            # domain, or the range if one is set, has to compare `<=` to the Entity node.
+            return pred.domain <= entity or (pred.range is not None and pred.range <= entity)
+        return (cand for cand in self.schema.predicates() if touches_entity(cand))
+
+    @abc.abstractmethod
+    def extract(
+            self, subject: node.Node, content: typing.Any,
+            principals: typing.Iterable[bsfs.schema.Predicate],
+            ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+        """Return an iterator over (node, predicate, value) triples."""
+        # FIXME: type annotation could be more strict: value is Hashable
+
+## EOF ##