about | summary | refs | log | tree | commit | diff | stats
path: root/bsie
diff options
context:
space:
mode:
author: Matthias Baumgartner <dev@igsor.net> 2022-11-25 14:31:29 +0100
committer: Matthias Baumgartner <dev@igsor.net> 2022-11-25 14:31:29 +0100
commit: e174a25585e64eb1b0759440cad48d642dd31829 (patch)
tree: fadee735ef922156ba4a67506154c26fab2ecdd5 /bsie
parent: 9389c741bdbbca9adbff6099d440706cd63deac4 (diff)
download: bsie-e174a25585e64eb1b0759440cad48d642dd31829.tar.gz
          bsie-e174a25585e64eb1b0759440cad48d642dd31829.tar.bz2
          bsie-e174a25585e64eb1b0759440cad48d642dd31829.zip
use schema and predicate types in extractors
Diffstat (limited to 'bsie')
-rw-r--r-- bsie/base/errors.py | 13
-rw-r--r-- bsie/base/extractor.py | 51
-rw-r--r-- bsie/extractor/generic/constant.py | 20
-rw-r--r-- bsie/extractor/generic/path.py | 40
-rw-r--r-- bsie/extractor/generic/stat.py | 34
-rw-r--r-- bsie/utils/bsfs.py | 2
-rw-r--r-- bsie/utils/namespaces.py | 3
-rw-r--r-- bsie/utils/node.py | 2
8 files changed, 109 insertions, 56 deletions
diff --git a/bsie/base/errors.py b/bsie/base/errors.py
index f86ffb2..eedce3b 100644
--- a/bsie/base/errors.py
+++ b/bsie/base/errors.py
@@ -8,15 +8,22 @@ Author: Matthias Baumgartner, 2022
import typing
# exports
-__all__: typing.Sequence[str] = []
+__all__: typing.Sequence[str] = (
+ 'ExtractorError',
+ )
+
+
## code ##
-class _BSIE_Error(Exception):
+class _BSIEError(Exception):
"""Generic BSIE error."""
-class ReaderError(_BSIE_Error):
+class ExtractorError(_BSIEError):
+ """The Extractor failed to process the given content."""
+
+class ReaderError(_BSIEError):
"""The Reader failed to read the given file."""
## EOF ##
diff --git a/bsie/base/extractor.py b/bsie/base/extractor.py
index ea43925..a6a69c6 100644
--- a/bsie/base/extractor.py
+++ b/bsie/base/extractor.py
@@ -11,13 +11,38 @@ import typing
# inner-module imports
from . import reader
from bsie.utils import node
-from bsie.utils.bsfs import URI, typename
+from bsie.utils.bsfs import schema as _schema, typename
# exports
__all__: typing.Sequence[str] = (
'Extractor',
)
+# constants
+
+# essential definitions typically used in extractor schemas.
+# NOTE: The definition here is only for convenience; Each Extractor must implement its use, if so desired.
+SCHEMA_PREAMBLE = '''
+ # common external prefixes
+ prefix owl: <http://www.w3.org/2002/07/owl#>
+ prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+ prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+ prefix xsd: <http://www.w3.org/2001/XMLSchema#>
+ prefix schema: <http://schema.org/>
+
+ # common bsfs prefixes
+ prefix bsfs: <http://bsfs.ai/schema/>
+ prefix bse: <http://bsfs.ai/schema/Entity#>
+
+ # essential nodes
+ bsfs:Entity rdfs:subClassOf bsfs:Node .
+
+ # common definitions
+ xsd:string rdfs:subClassOf bsfs:Literal .
+ xsd:integer rdfs:subClassOf bsfs:Literal .
+
+ '''
+
## code ##
@@ -27,23 +52,37 @@ class Extractor(abc.ABC):
# what type of content is expected (i.e. reader subclass).
CONTENT_READER: typing.Optional[typing.Type[reader.Reader]] = None
+ # extractor schema.
+ schema: _schema.Schema
+
+ def __init__(self, schema: _schema.Schema):
+ self.schema = schema
+
def __str__(self) -> str:
return typename(self)
def __repr__(self) -> str:
return f'{typename(self)}()'
- @abc.abstractmethod
- def schema(self) -> str:
- """Return the schema (predicates and nodes) produced by this Extractor."""
+
+ def predicates(self) -> typing.Iterator[_schema.Predicate]:
+ """Return the predicates that may be part of extracted triples."""
+ # NOTE: Some predicates in the schema might not occur in actual triples,
+ # but are defined due to predicate class hierarchy. E.g., bsfs:Predicate
+ # is part of every schema but should not be used in triples.
+ # Announcing all predicates might not be the most efficient way, however,
+ # it is the most safe one. Concrete extractors that produce additional
+ # predicates (e.g. auxiliary nodes with their own predicates) should
+ # overwrite this method to only include the principal predicates.
+ return self.schema.predicates()
@abc.abstractmethod
def extract(
self,
subject: node.Node,
content: typing.Any,
- predicates: typing.Iterable[URI],
- ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]:
+ predicates: typing.Iterable[_schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, _schema.Predicate, typing.Any]]:
"""Return (node, predicate, value) triples."""
## EOF ##
diff --git a/bsie/extractor/generic/constant.py b/bsie/extractor/generic/constant.py
index e243131..795bac6 100644
--- a/bsie/extractor/generic/constant.py
+++ b/bsie/extractor/generic/constant.py
@@ -7,9 +7,9 @@ Author: Matthias Baumgartner, 2022
# imports
import typing
-# inner-module imports
+# bsie imports
from bsie.base import extractor
-from bsie.utils.bsfs import URI
+from bsie.utils.bsfs import URI, schema as _schema
from bsie.utils.node import Node
# exports
@@ -25,26 +25,26 @@ class Constant(extractor.Extractor):
CONTENT_READER = None
+ # predicate/value pairs to be produced.
+ _tuples: typing.Tuple[typing.Tuple[_schema.Predicate, typing.Any], ...]
+
def __init__(
self,
schema: str,
tuples: typing.Iterable[typing.Tuple[URI, typing.Any]],
):
- self._schema = schema
- self._tuples = tuples
- # FIXME: use schema instance for predicate checking
- #self._tuples = [(pred, value) for pred, value in tuples if pred in schema]
+ super().__init__(_schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + schema))
+ # NOTE: Raises a KeyError if the predicate is not part of the schema
+ self._tuples = tuple((self.schema.predicate(p_uri), value) for p_uri, value in tuples)
# FIXME: use schema instance for value checking
- def schema(self) -> str:
- return self._schema
def extract(
self,
subject: Node,
content: None,
- predicates: typing.Iterable[URI],
- ) -> typing.Iterator[typing.Tuple[Node, URI, typing.Any]]:
+ predicates: typing.Iterable[_schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[Node, _schema.Predicate, typing.Any]]:
for pred, value in self._tuples:
if pred in predicates:
yield subject, pred, value
diff --git a/bsie/extractor/generic/path.py b/bsie/extractor/generic/path.py
index c39bbd2..f358a79 100644
--- a/bsie/extractor/generic/path.py
+++ b/bsie/extractor/generic/path.py
@@ -8,11 +8,10 @@ Author: Matthias Baumgartner, 2022
import os
import typing
-# inner-module imports
+# bsie imports
from bsie.base import extractor
from bsie.utils import node, ns
-from bsie.utils.bsfs import URI
-import bsie.reader.path
+from bsie.utils.bsfs import schema
# exports
__all__: typing.Sequence[str] = (
@@ -27,30 +26,31 @@ class Path(extractor.Extractor):
CONTENT_READER = bsie.reader.path.Path
- def __init__(self):
- self.__callmap = {
- ns.bse.filename: self.__filename,
- }
+ # mapping from predicate to handler function.
+ _callmap: typing.Dict[schema.Predicate, typing.Callable[[str], typing.Any]]
- def schema(self) -> str:
- return '''
- bse:filename a bsfs:Predicate ;
+ def __init__(self):
+ super().__init__(schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ bse:filename rdfs:subClassOf bsfs:Predicate ;
rdfs:domain bsfs:Entity ;
rdfs:range xsd:string ;
- rdf:label "File name"^^xsd:string ;
+ rdfs:label "File name"^^xsd:string ;
schema:description "Filename of entity in some filesystem."^^xsd:string ;
owl:maxCardinality "INF"^^xsd:number .
- '''
+ '''))
+ self._callmap = {
+ self.schema.predicate(ns.bse.filename): self.__filename,
+ }
def extract(
self,
subject: node.Node,
content: CONTENT_READER.CONTENT_TYPE,
- predicates: typing.Iterable[URI],
- ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]:
+ predicates: typing.Iterable[schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, schema.Predicate, typing.Any]]:
for pred in predicates:
# find callback
- clbk = self.__callmap.get(pred)
+ clbk = self._callmap.get(pred)
if clbk is None:
continue
# get value
@@ -60,11 +60,15 @@ class Path(extractor.Extractor):
# produce triple
yield subject, pred, value
- def __filename(self, path: str) -> str:
+ def __filename(self, path: str) -> typing.Optional[str]:
try:
return os.path.basename(path)
- except Exception:
- # FIXME: some kind of error reporting (e.g. logging)
+ except Exception: # some error, skip.
+ # FIXME: some kind of error reporting (e.g. logging)?
+ # Options: (a) Fail silently (current); (b) Skip and report to log;
+ # (c) Raise ExtractorError (aborts extraction); (d) separate content type
+ # checks from basename errors (report content type errors, skip basename
+ # errors)
return None
## EOF ##
diff --git a/bsie/extractor/generic/stat.py b/bsie/extractor/generic/stat.py
index d74369c..e5387af 100644
--- a/bsie/extractor/generic/stat.py
+++ b/bsie/extractor/generic/stat.py
@@ -5,14 +5,13 @@ A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
# imports
+import os
import typing
-# inner-module imports
+# bsie imports
from bsie.base import extractor
from bsie.utils import node, ns
-from bsie.utils.bsfs import URI
-import bsie.reader.stat
-
+from bsie.utils.bsfs import schema as _schema
# exports
__all__: typing.Sequence[str] = (
@@ -27,30 +26,31 @@ class Stat(extractor.Extractor):
CONTENT_READER = bsie.reader.stat.Stat
- def __init__(self):
- self.__callmap = {
- ns.bse.filesize: self.__filesize,
- }
+ # mapping from predicate to handler function.
+ _callmap: typing.Dict[_schema.Predicate, typing.Callable[[os.stat_result], typing.Any]]
- def schema(self) -> str:
- return '''
- bse:filesize a bsfs:Predicate ;
+ def __init__(self):
+ super().__init__(_schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ bse:filesize rdfs:subClassOf bsfs:Predicate ;
rdfs:domain bsfs:Entity ;
rdfs:range xsd:integer ;
- rdf:label "File size"^^xsd:string ;
+ rdfs:label "File size"^^xsd:string ;
schema:description "File size of entity in some filesystem."^^xsd:string ;
owl:maxCardinality "INF"^^xsd:number .
- '''
+ '''))
+ self._callmap = {
+ self.schema.predicate(ns.bse.filesize): self.__filesize,
+ }
def extract(
self,
subject: node.Node,
content: CONTENT_READER.CONTENT_TYPE,
- predicates: typing.Iterable[URI],
- ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]:
+ predicates: typing.Iterable[_schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, _schema.Predicate, typing.Any]]:
for pred in predicates:
# find callback
- clbk = self.__callmap.get(pred)
+ clbk = self._callmap.get(pred)
if clbk is None:
continue
# get value
@@ -60,7 +60,7 @@ class Stat(extractor.Extractor):
# produce triple
yield subject, pred, value
- def __filesize(self, content: CONTENT_READER.CONTENT_TYPE) -> int:
+ def __filesize(self, content: os.stat_result) -> typing.Optional[int]:
"""Return the file size."""
try:
return content.st_size
diff --git a/bsie/utils/bsfs.py b/bsie/utils/bsfs.py
index 1ae657c..01ec5d1 100644
--- a/bsie/utils/bsfs.py
+++ b/bsie/utils/bsfs.py
@@ -8,6 +8,7 @@ Author: Matthias Baumgartner, 2022
import typing
# bsfs imports
+from bsfs import schema
from bsfs.namespace import Namespace
from bsfs.utils import URI, typename
@@ -15,6 +16,7 @@ from bsfs.utils import URI, typename
__all__: typing.Sequence[str] = (
'Namespace',
'URI',
+ 'schema',
'typename',
)
diff --git a/bsie/utils/namespaces.py b/bsie/utils/namespaces.py
index 67ccc71..13be96b 100644
--- a/bsie/utils/namespaces.py
+++ b/bsie/utils/namespaces.py
@@ -7,13 +7,14 @@ Author: Matthias Baumgartner, 2022
# imports
import typing
-# bsie imports
+# inner-module imports
from . import bsfs as _bsfs
# constants
bse = _bsfs.Namespace('http://bsfs.ai/schema/Entity#')
bsfs = _bsfs.Namespace('http://bsfs.ai/schema/')
bsm = _bsfs.Namespace('http://bsfs.ai/schema/meta#')
+xsd = _bsfs.Namespace('http://www.w3.org/2001/XMLSchema#')
# export
__all__: typing.Sequence[str] = (
diff --git a/bsie/utils/node.py b/bsie/utils/node.py
index 60863a4..3a0f06b 100644
--- a/bsie/utils/node.py
+++ b/bsie/utils/node.py
@@ -12,7 +12,7 @@ from bsie.utils.bsfs import URI
# exports
__all__: typing.Sequence[str] = (
- 'Node'
+ 'Node',
)