aboutsummaryrefslogtreecommitdiffstats
path: root/bsie/base
diff options
context:
space:
mode:
authorMatthias Baumgartner <dev@igsor.net>2022-11-25 14:31:29 +0100
committerMatthias Baumgartner <dev@igsor.net>2022-11-25 14:31:29 +0100
commite174a25585e64eb1b0759440cad48d642dd31829 (patch)
treefadee735ef922156ba4a67506154c26fab2ecdd5 /bsie/base
parent9389c741bdbbca9adbff6099d440706cd63deac4 (diff)
downloadbsie-e174a25585e64eb1b0759440cad48d642dd31829.tar.gz
bsie-e174a25585e64eb1b0759440cad48d642dd31829.tar.bz2
bsie-e174a25585e64eb1b0759440cad48d642dd31829.zip
use schema and predicate types in extractors
Diffstat (limited to 'bsie/base')
-rw-r--r--bsie/base/errors.py13
-rw-r--r--bsie/base/extractor.py51
2 files changed, 55 insertions, 9 deletions
diff --git a/bsie/base/errors.py b/bsie/base/errors.py
index f86ffb2..eedce3b 100644
--- a/bsie/base/errors.py
+++ b/bsie/base/errors.py
@@ -8,15 +8,22 @@ Author: Matthias Baumgartner, 2022
import typing
# exports
-__all__: typing.Sequence[str] = []
+__all__: typing.Sequence[str] = (
+ 'ExtractorError',
+ )
+
+
## code ##
-class _BSIE_Error(Exception):
+class _BSIEError(Exception):
"""Generic BSIE error."""
-class ReaderError(_BSIE_Error):
+class ExtractorError(_BSIEError):
+ """The Extractor failed to process the given content."""
+
+class ReaderError(_BSIEError):
"""The Reader failed to read the given file."""
## EOF ##
diff --git a/bsie/base/extractor.py b/bsie/base/extractor.py
index ea43925..a6a69c6 100644
--- a/bsie/base/extractor.py
+++ b/bsie/base/extractor.py
@@ -11,13 +11,38 @@ import typing
# inner-module imports
from . import reader
from bsie.utils import node
-from bsie.utils.bsfs import URI, typename
+from bsie.utils.bsfs import schema as _schema, typename
# exports
__all__: typing.Sequence[str] = (
'Extractor',
)
+# constants
+
+# essential definitions typically used in extractor schemas.
+# NOTE: The definition here is only for convenience; each Extractor must implement its use, if so desired.
+SCHEMA_PREAMBLE = '''
+ # common external prefixes
+ prefix owl: <http://www.w3.org/2002/07/owl#>
+ prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+ prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+ prefix xsd: <http://www.w3.org/2001/XMLSchema#>
+ prefix schema: <http://schema.org/>
+
+ # common bsfs prefixes
+ prefix bsfs: <http://bsfs.ai/schema/>
+ prefix bse: <http://bsfs.ai/schema/Entity#>
+
+ # essential nodes
+ bsfs:Entity rdfs:subClassOf bsfs:Node .
+
+ # common definitions
+ xsd:string rdfs:subClassOf bsfs:Literal .
+ xsd:integer rdfs:subClassOf bsfs:Literal .
+
+ '''
+
## code ##
@@ -27,23 +52,37 @@ class Extractor(abc.ABC):
# what type of content is expected (i.e. reader subclass).
CONTENT_READER: typing.Optional[typing.Type[reader.Reader]] = None
+ # extractor schema.
+ schema: _schema.Schema
+
+ def __init__(self, schema: _schema.Schema):
+ self.schema = schema
+
def __str__(self) -> str:
return typename(self)
def __repr__(self) -> str:
return f'{typename(self)}()'
- @abc.abstractmethod
- def schema(self) -> str:
- """Return the schema (predicates and nodes) produced by this Extractor."""
+
+ def predicates(self) -> typing.Iterator[_schema.Predicate]:
+ """Return the predicates that may be part of extracted triples."""
+ # NOTE: Some predicates in the schema might not occur in actual triples,
+ # but are defined due to predicate class hierarchy. E.g., bsfs:Predicate
+ # is part of every schema but should not be used in triples.
+ # Announcing all predicates might not be the most efficient way; however,
+ # it is the safest one. Concrete extractors that produce additional
+ # predicates (e.g. auxiliary nodes with their own predicates) should
+ # override this method to only include the principal predicates.
+ return self.schema.predicates()
@abc.abstractmethod
def extract(
self,
subject: node.Node,
content: typing.Any,
- predicates: typing.Iterable[URI],
- ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]:
+ predicates: typing.Iterable[_schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, _schema.Predicate, typing.Any]]:
"""Return (node, predicate, value) triples."""
## EOF ##