diff options
Diffstat (limited to 'bsie/extractor')
-rw-r--r-- | bsie/extractor/__init__.py | 15 | ||||
-rw-r--r-- | bsie/extractor/generic/__init__.py | 16 | ||||
-rw-r--r-- | bsie/extractor/generic/constant.py | 57 | ||||
-rw-r--r-- | bsie/extractor/generic/path.py | 74 | ||||
-rw-r--r-- | bsie/extractor/generic/stat.py | 70 |
5 files changed, 232 insertions, 0 deletions
diff --git a/bsie/extractor/__init__.py b/bsie/extractor/__init__.py new file mode 100644 index 0000000..ef31343 --- /dev/null +++ b/bsie/extractor/__init__.py @@ -0,0 +1,15 @@ +"""Extractors produce triples from some content. + +Each Extractor class is linked to the Reader class whose content it requires. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# exports +__all__: typing.Sequence[str] = [] + +## EOF ## diff --git a/bsie/extractor/generic/__init__.py b/bsie/extractor/generic/__init__.py new file mode 100644 index 0000000..0cb7e7f --- /dev/null +++ b/bsie/extractor/generic/__init__.py @@ -0,0 +1,16 @@ +"""Generic extractors focus on information that is typically available on all +files. Examples include file system information (file name and size, mime type, +etc.) and information that is independent of the actual file (constant triples, +host platform information, current time, etc.). + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# exports +__all__: typing.Sequence[str] = [] + +## EOF ## diff --git a/bsie/extractor/generic/constant.py b/bsie/extractor/generic/constant.py new file mode 100644 index 0000000..11384e6 --- /dev/null +++ b/bsie/extractor/generic/constant.py @@ -0,0 +1,57 @@ +"""The Constant extractor produces pre-specified triples. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# bsie imports +from bsie.base import extractor +from bsie.utils import bsfs, node + +# exports +__all__: typing.Sequence[str] = ( + 'Constant', + ) + + +## code ## + +class Constant(extractor.Extractor): + """Produce pre-specified triples, independent of the file's content.""" + + CONTENT_READER = None + + # predicate/value pairs to be produced. 
+ _tuples: typing.Tuple[typing.Tuple[bsfs.schema.Predicate, typing.Any], ...] + + def __init__( + self, + schema: str, + tuples: typing.Iterable[typing.Tuple[bsfs.URI, typing.Any]], + ): + super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + schema)) + # NOTE: Raises a KeyError if the predicate is not part of the schema + self._tuples = tuple((self.schema.predicate(p_uri), value) for p_uri, value in tuples) + # TODO: use schema instance for value checking + + def __eq__(self, other: typing.Any) -> bool: + return super().__eq__(other) \ + and self._tuples == other._tuples + + def __hash__(self) -> int: + return hash((super().__hash__(), self._tuples)) + + def extract( + self, + subject: node.Node, + content: None, + principals: typing.Iterable[bsfs.schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + for pred, value in self._tuples: + if pred in principals: + yield subject, pred, value + +## EOF ## diff --git a/bsie/extractor/generic/path.py b/bsie/extractor/generic/path.py new file mode 100644 index 0000000..7018e12 --- /dev/null +++ b/bsie/extractor/generic/path.py @@ -0,0 +1,74 @@ +"""Extract information from a file's path, such as the file name. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import os +import typing + +# bsie imports +from bsie.base import extractor +from bsie.utils import bsfs, node, ns + +# exports +__all__: typing.Sequence[str] = ( + 'Path', + ) + + +## code ## + +class Path(extractor.Extractor): + """Extract information from file's path.""" + + CONTENT_READER = 'bsie.reader.path.Path' + + # mapping from predicate to handler function. 
+ _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[str], typing.Any]] + + def __init__(self): + super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' + bse:filename rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:File ; + rdfs:range xsd:string ; + rdfs:label "File name"^^xsd:string ; + schema:description "Filename of entity in some filesystem."^^xsd:string ; + bsfs:unique "false"^^xsd:boolean . + ''')) + self._callmap = { + self.schema.predicate(ns.bse.filename): self.__filename, + } + + def extract( + self, + subject: node.Node, + content: str, + principals: typing.Iterable[bsfs.schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + for pred in principals: + # find callback + clbk = self._callmap.get(pred) + if clbk is None: + continue + # get value + value = clbk(content) + if value is None: + continue + # produce triple + yield subject, pred, value + + def __filename(self, path: str) -> typing.Optional[str]: + try: + return os.path.basename(path) + except Exception: # pylint: disable=broad-except # we explicitly want to catch everything + # some error, skip + # FIXME: some kind of error reporting (e.g. logging)? + # Options: (a) Fail silently (current); (b) Skip and report to log; + # (c) Raise ExtractorError (aborts extraction); (d) separate content type + # checks from basename errors (report content type errors, skip basename + # errors) + return None + +## EOF ## diff --git a/bsie/extractor/generic/stat.py b/bsie/extractor/generic/stat.py new file mode 100644 index 0000000..0b9ce29 --- /dev/null +++ b/bsie/extractor/generic/stat.py @@ -0,0 +1,70 @@ +"""Extract information from the file system, such as filesize. + +Part of the bsie module. +A copy of the license is provided with the project. 
+Author: Matthias Baumgartner, 2022 +""" +# imports +import os +import typing + +# bsie imports +from bsie.base import extractor +from bsie.utils import bsfs, node, ns + +# exports +__all__: typing.Sequence[str] = ( + 'Stat', + ) + + +## code ## + +class Stat(extractor.Extractor): + """Extract information from the file system.""" + + CONTENT_READER = 'bsie.reader.stat.Stat' + + # mapping from predicate to handler function. + _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[os.stat_result], typing.Any]] + + def __init__(self): + super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' + bse:filesize rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:File ; + rdfs:range xsd:integer ; + rdfs:label "File size"^^xsd:string ; + schema:description "File size of entity in some filesystem."^^xsd:string ; + bsfs:unique "false"^^xsd:boolean . + ''')) + self._callmap = { + self.schema.predicate(ns.bse.filesize): self.__filesize, + } + + def extract( + self, + subject: node.Node, + content: os.stat_result, + principals: typing.Iterable[bsfs.schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + for pred in principals: + # find callback + clbk = self._callmap.get(pred) + if clbk is None: + continue + # get value + value = clbk(content) + if value is None: + continue + # produce triple + yield subject, pred, value + + def __filesize(self, content: os.stat_result) -> typing.Optional[int]: + """Return the file size.""" + try: + return content.st_size + except Exception: # pylint: disable=broad-except # we explicitly want to catch everything + # FIXME: some kind of error reporting (e.g. logging) + return None + +## EOF ## |