about summary refs log tree commit diff stats
path: root/bsie/extractor/generic
diff options
context:
space:
mode:
Diffstat (limited to 'bsie/extractor/generic')
-rw-r--r-- bsie/extractor/generic/constant.py 26
-rw-r--r-- bsie/extractor/generic/path.py 44
-rw-r--r-- bsie/extractor/generic/stat.py 38
3 files changed, 59 insertions, 49 deletions
diff --git a/bsie/extractor/generic/constant.py b/bsie/extractor/generic/constant.py
index e243131..7da792a 100644
--- a/bsie/extractor/generic/constant.py
+++ b/bsie/extractor/generic/constant.py
@@ -7,9 +7,9 @@ Author: Matthias Baumgartner, 2022
# imports
import typing
-# inner-module imports
+# bsie imports
from bsie.base import extractor
-from bsie.utils.bsfs import URI
+from bsie.utils.bsfs import URI, schema as _schema
from bsie.utils.node import Node
# exports
@@ -25,26 +25,32 @@ class Constant(extractor.Extractor):
CONTENT_READER = None
+ # predicate/value pairs to be produced.
+ _tuples: typing.Tuple[typing.Tuple[_schema.Predicate, typing.Any], ...]
+
def __init__(
self,
schema: str,
tuples: typing.Iterable[typing.Tuple[URI, typing.Any]],
):
- self._schema = schema
- self._tuples = tuples
- # FIXME: use schema instance for predicate checking
- #self._tuples = [(pred, value) for pred, value in tuples if pred in schema]
+ super().__init__(_schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + schema))
+ # NOTE: Raises a KeyError if the predicate is not part of the schema
+ self._tuples = tuple((self.schema.predicate(p_uri), value) for p_uri, value in tuples)
# FIXME: use schema instance for value checking
- def schema(self) -> str:
- return self._schema
+ def __eq__(self, other: typing.Any) -> bool:
+ return super().__eq__(other) \
+ and self._tuples == other._tuples
+
+ def __hash__(self) -> int:
+ return hash((super().__hash__(), self._tuples))
def extract(
self,
subject: Node,
content: None,
- predicates: typing.Iterable[URI],
- ) -> typing.Iterator[typing.Tuple[Node, URI, typing.Any]]:
+ predicates: typing.Iterable[_schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[Node, _schema.Predicate, typing.Any]]:
for pred, value in self._tuples:
if pred in predicates:
yield subject, pred, value
diff --git a/bsie/extractor/generic/path.py b/bsie/extractor/generic/path.py
index c39bbd2..f346f97 100644
--- a/bsie/extractor/generic/path.py
+++ b/bsie/extractor/generic/path.py
@@ -8,11 +8,10 @@ Author: Matthias Baumgartner, 2022
import os
import typing
-# inner-module imports
+# bsie imports
from bsie.base import extractor
from bsie.utils import node, ns
-from bsie.utils.bsfs import URI
-import bsie.reader.path
+from bsie.utils.bsfs import schema
# exports
__all__: typing.Sequence[str] = (
@@ -25,32 +24,33 @@ __all__: typing.Sequence[str] = (
class Path(extractor.Extractor):
"""Extract information from file's path."""
- CONTENT_READER = bsie.reader.path.Path
+ CONTENT_READER = 'bsie.reader.path.Path'
- def __init__(self):
- self.__callmap = {
- ns.bse.filename: self.__filename,
- }
+ # mapping from predicate to handler function.
+ _callmap: typing.Dict[schema.Predicate, typing.Callable[[str], typing.Any]]
- def schema(self) -> str:
- return '''
- bse:filename a bsfs:Predicate ;
+ def __init__(self):
+ super().__init__(schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ bse:filename rdfs:subClassOf bsfs:Predicate ;
rdfs:domain bsfs:Entity ;
rdfs:range xsd:string ;
- rdf:label "File name"^^xsd:string ;
+ rdfs:label "File name"^^xsd:string ;
schema:description "Filename of entity in some filesystem."^^xsd:string ;
owl:maxCardinality "INF"^^xsd:number .
- '''
+ '''))
+ self._callmap = {
+ self.schema.predicate(ns.bse.filename): self.__filename,
+ }
def extract(
self,
subject: node.Node,
- content: CONTENT_READER.CONTENT_TYPE,
- predicates: typing.Iterable[URI],
- ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]:
+ content: str,
+ predicates: typing.Iterable[schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, schema.Predicate, typing.Any]]:
for pred in predicates:
# find callback
- clbk = self.__callmap.get(pred)
+ clbk = self._callmap.get(pred)
if clbk is None:
continue
# get value
@@ -60,11 +60,15 @@ class Path(extractor.Extractor):
# produce triple
yield subject, pred, value
- def __filename(self, path: str) -> str:
+ def __filename(self, path: str) -> typing.Optional[str]:
try:
return os.path.basename(path)
- except Exception:
- # FIXME: some kind of error reporting (e.g. logging)
+ except Exception: # some error, skip.
+ # FIXME: some kind of error reporting (e.g. logging)?
+ # Options: (a) Fail silently (current); (b) Skip and report to log;
+ # (c) Raise ExtractorError (aborts extraction); (d) separate content type
+ # checks from basename errors (report content type errors, skip basename
+ # errors)
return None
## EOF ##
diff --git a/bsie/extractor/generic/stat.py b/bsie/extractor/generic/stat.py
index d74369c..7088c0a 100644
--- a/bsie/extractor/generic/stat.py
+++ b/bsie/extractor/generic/stat.py
@@ -5,14 +5,13 @@ A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
# imports
+import os
import typing
-# inner-module imports
+# bsie imports
from bsie.base import extractor
from bsie.utils import node, ns
-from bsie.utils.bsfs import URI
-import bsie.reader.stat
-
+from bsie.utils.bsfs import schema as _schema
# exports
__all__: typing.Sequence[str] = (
@@ -25,32 +24,33 @@ __all__: typing.Sequence[str] = (
class Stat(extractor.Extractor):
"""Extract information from the file system."""
- CONTENT_READER = bsie.reader.stat.Stat
+ CONTENT_READER = 'bsie.reader.stat.Stat'
- def __init__(self):
- self.__callmap = {
- ns.bse.filesize: self.__filesize,
- }
+ # mapping from predicate to handler function.
+ _callmap: typing.Dict[_schema.Predicate, typing.Callable[[os.stat_result], typing.Any]]
- def schema(self) -> str:
- return '''
- bse:filesize a bsfs:Predicate ;
+ def __init__(self):
+ super().__init__(_schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ bse:filesize rdfs:subClassOf bsfs:Predicate ;
rdfs:domain bsfs:Entity ;
rdfs:range xsd:integer ;
- rdf:label "File size"^^xsd:string ;
+ rdfs:label "File size"^^xsd:string ;
schema:description "File size of entity in some filesystem."^^xsd:string ;
owl:maxCardinality "INF"^^xsd:number .
- '''
+ '''))
+ self._callmap = {
+ self.schema.predicate(ns.bse.filesize): self.__filesize,
+ }
def extract(
self,
subject: node.Node,
- content: CONTENT_READER.CONTENT_TYPE,
- predicates: typing.Iterable[URI],
- ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]:
+ content: os.stat_result,
+ predicates: typing.Iterable[_schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, _schema.Predicate, typing.Any]]:
for pred in predicates:
# find callback
- clbk = self.__callmap.get(pred)
+ clbk = self._callmap.get(pred)
if clbk is None:
continue
# get value
@@ -60,7 +60,7 @@ class Stat(extractor.Extractor):
# produce triple
yield subject, pred, value
- def __filesize(self, content: CONTENT_READER.CONTENT_TYPE) -> int:
+ def __filesize(self, content: os.stat_result) -> typing.Optional[int]:
"""Return the file size."""
try:
return content.st_size