aboutsummaryrefslogtreecommitdiffstats
path: root/bsie/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'bsie/extractor')
-rw-r--r--bsie/extractor/__init__.py15
-rw-r--r--bsie/extractor/generic/__init__.py16
-rw-r--r--bsie/extractor/generic/constant.py57
-rw-r--r--bsie/extractor/generic/path.py74
-rw-r--r--bsie/extractor/generic/stat.py70
5 files changed, 232 insertions, 0 deletions
diff --git a/bsie/extractor/__init__.py b/bsie/extractor/__init__.py
new file mode 100644
index 0000000..ef31343
--- /dev/null
+++ b/bsie/extractor/__init__.py
@@ -0,0 +1,15 @@
+"""Extractors produce triples from some content.
+
+Each Extractor class is linked to the Reader class whose content it requires.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# exports
+# The list is empty: `from bsie.extractor import *` exposes no names.
+__all__: typing.Sequence[str] = []
+
+## EOF ##
diff --git a/bsie/extractor/generic/__init__.py b/bsie/extractor/generic/__init__.py
new file mode 100644
index 0000000..0cb7e7f
--- /dev/null
+++ b/bsie/extractor/generic/__init__.py
@@ -0,0 +1,16 @@
+"""Generic extractors focus on information that is typically available on all
+files. Examples include file system information (file name and size, mime type,
+etc.) and information that is independent of the actual file (constant triples,
+host platform infos, current time, etc.).
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# exports
+# The list is empty: `from bsie.extractor.generic import *` exposes no names.
+__all__: typing.Sequence[str] = []
+
+## EOF ##
diff --git a/bsie/extractor/generic/constant.py b/bsie/extractor/generic/constant.py
new file mode 100644
index 0000000..11384e6
--- /dev/null
+++ b/bsie/extractor/generic/constant.py
@@ -0,0 +1,57 @@
+"""The Constant extractor produces pre-specified triples.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsie imports
+from bsie.base import extractor
+from bsie.utils import bsfs, node
+
+# exports
+# Only the Constant extractor class is exported from this module.
+__all__: typing.Sequence[str] = (
+ 'Constant',
+ )
+
+
+## code ##
+
+class Constant(extractor.Extractor):
+ """Extract information from file's path."""
+
+ CONTENT_READER = None
+
+ # predicate/value pairs to be produced.
+ _tuples: typing.Tuple[typing.Tuple[bsfs.schema.Predicate, typing.Any], ...]
+
+ def __init__(
+ self,
+ schema: str,
+ tuples: typing.Iterable[typing.Tuple[bsfs.URI, typing.Any]],
+ ):
+ super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + schema))
+ # NOTE: Raises a KeyError if the predicate is not part of the schema
+ self._tuples = tuple((self.schema.predicate(p_uri), value) for p_uri, value in tuples)
+ # TODO: use schema instance for value checking
+
+ def __eq__(self, other: typing.Any) -> bool:
+ return super().__eq__(other) \
+ and self._tuples == other._tuples
+
+ def __hash__(self) -> int:
+ return hash((super().__hash__(), self._tuples))
+
+ def extract(
+ self,
+ subject: node.Node,
+ content: None,
+ principals: typing.Iterable[bsfs.schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+ for pred, value in self._tuples:
+ if pred in principals:
+ yield subject, pred, value
+
+## EOF ##
diff --git a/bsie/extractor/generic/path.py b/bsie/extractor/generic/path.py
new file mode 100644
index 0000000..7018e12
--- /dev/null
+++ b/bsie/extractor/generic/path.py
@@ -0,0 +1,74 @@
+"""The Path extractor produces triples from a file's path, such as its filename.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import os
+import typing
+
+# bsie imports
+from bsie.base import extractor
+from bsie.utils import bsfs, node, ns
+
+# exports
+# Only the Path extractor class is exported from this module.
+__all__: typing.Sequence[str] = (
+ 'Path',
+ )
+
+
+## code ##
+
+class Path(extractor.Extractor):
+ """Extract information from file's path."""
+
+ CONTENT_READER = 'bsie.reader.path.Path'
+
+ # mapping from predicate to handler function.
+ _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[str], typing.Any]]
+
+ def __init__(self):
+ super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ bse:filename rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:File ;
+ rdfs:range xsd:string ;
+ rdfs:label "File name"^^xsd:string ;
+ schema:description "Filename of entity in some filesystem."^^xsd:string ;
+ bsfs:unique "false"^^xsd:boolean .
+ '''))
+ self._callmap = {
+ self.schema.predicate(ns.bse.filename): self.__filename,
+ }
+
+ def extract(
+ self,
+ subject: node.Node,
+ content: str,
+ principals: typing.Iterable[bsfs.schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+ for pred in principals:
+ # find callback
+ clbk = self._callmap.get(pred)
+ if clbk is None:
+ continue
+ # get value
+ value = clbk(content)
+ if value is None:
+ continue
+ # produce triple
+ yield subject, pred, value
+
+ def __filename(self, path: str) -> typing.Optional[str]:
+ try:
+ return os.path.basename(path)
+ except Exception: # pylint: disable=broad-except # we explicitly want to catch everything
+ # some error, skip
+ # FIXME: some kind of error reporting (e.g. logging)?
+ # Options: (a) Fail silently (current); (b) Skip and report to log;
+ # (c) Raise ExtractorError (aborts extraction); (d) separate content type
+ # checks from basename errors (report content type errors, skip basename
+ # errors)
+ return None
+
+## EOF ##
diff --git a/bsie/extractor/generic/stat.py b/bsie/extractor/generic/stat.py
new file mode 100644
index 0000000..0b9ce29
--- /dev/null
+++ b/bsie/extractor/generic/stat.py
@@ -0,0 +1,70 @@
+"""Extract information from the file system, such as filesize.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import os
+import typing
+
+# bsie imports
+from bsie.base import extractor
+from bsie.utils import bsfs, node, ns
+
+# exports
+# Only the Stat extractor class is exported from this module.
+__all__: typing.Sequence[str] = (
+ 'Stat',
+ )
+
+
+## code ##
+
+class Stat(extractor.Extractor):
+ """Extract information from the file system."""
+
+ CONTENT_READER = 'bsie.reader.stat.Stat'
+
+ # mapping from predicate to handler function.
+ _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[os.stat_result], typing.Any]]
+
+ def __init__(self):
+ super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ bse:filesize rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:File ;
+ rdfs:range xsd:integer ;
+ rdfs:label "File size"^^xsd:string ;
+ schema:description "File size of entity in some filesystem."^^xsd:string ;
+ bsfs:unique "false"^^xsd:boolean .
+ '''))
+ self._callmap = {
+ self.schema.predicate(ns.bse.filesize): self.__filesize,
+ }
+
+ def extract(
+ self,
+ subject: node.Node,
+ content: os.stat_result,
+ principals: typing.Iterable[bsfs.schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+ for pred in principals:
+ # find callback
+ clbk = self._callmap.get(pred)
+ if clbk is None:
+ continue
+ # get value
+ value = clbk(content)
+ if value is None:
+ continue
+ # produce triple
+ yield subject, pred, value
+
+ def __filesize(self, content: os.stat_result) -> typing.Optional[int]:
+ """Return the file size."""
+ try:
+ return content.st_size
+ except Exception: # pylint: disable=broad-except # we explicitly want to catch everything
+ # FIXME: some kind of error reporting (e.g. logging)
+ return None
+
+## EOF ##