From 2da348c638ac5058d5acf09ab5df323ee04503d5 Mon Sep 17 00:00:00 2001
From: Matthias Baumgartner
Date: Mon, 31 Oct 2022 14:14:42 +0100
Subject: constant, filesize, and filename extractors

---
 bsie/base/extractor.py                  |  3 +-
 bsie/extractor/__init__.py              | 15 +++++++
 bsie/extractor/generic/__init__.py      | 16 ++++++++
 bsie/extractor/generic/constant.py      | 52 ++++++++++++++++++++++++
 bsie/extractor/generic/path.py          | 70 ++++++++++++++++++++++++++++++++
 bsie/extractor/generic/stat.py          | 71 +++++++++++++++++++++++++++++++++
 bsie/utils/__init__.py                  |  2 +
 bsie/utils/bsfs.py                      |  5 ++-
 bsie/utils/namespaces.py                | 25 ++++++++++++
 test/__init__.py                        |  0
 test/extractor/__init__.py              |  0
 test/extractor/generic/__init__.py      |  0
 test/extractor/generic/test_constant.py | 63 +++++++++++++++++++++++++++++
 test/extractor/generic/test_path.py     | 45 +++++++++++++++++++++
 test/extractor/generic/test_stat.py     | 43 ++++++++++++++++++++
 test/reader/__init__.py                 |  0
 16 files changed, 406 insertions(+), 4 deletions(-)
 create mode 100644 bsie/extractor/__init__.py
 create mode 100644 bsie/extractor/generic/__init__.py
 create mode 100644 bsie/extractor/generic/constant.py
 create mode 100644 bsie/extractor/generic/path.py
 create mode 100644 bsie/extractor/generic/stat.py
 create mode 100644 bsie/utils/namespaces.py
 create mode 100644 test/__init__.py
 create mode 100644 test/extractor/__init__.py
 create mode 100644 test/extractor/generic/__init__.py
 create mode 100644 test/extractor/generic/test_constant.py
 create mode 100644 test/extractor/generic/test_path.py
 create mode 100644 test/extractor/generic/test_stat.py
 create mode 100644 test/reader/__init__.py

diff --git a/bsie/base/extractor.py b/bsie/base/extractor.py
index d5b0922..ea43925 100644
--- a/bsie/base/extractor.py
+++ b/bsie/base/extractor.py
@@ -6,7 +6,6 @@ Author: Matthias Baumgartner, 2022
 """
 # imports
 import abc
-import collections
 import typing

 # inner-module imports
@@ -22,7 +21,7 @@ __all__: typing.Sequence[str] = (

 ## code ##

-class Extractor(abc.ABC, collections.abc.Iterable, collections.abc.Callable):
+class Extractor(abc.ABC):
     """Produce (node, predicate, value)-triples from some content."""

     # what type of content is expected (i.e. reader subclass).
diff --git a/bsie/extractor/__init__.py b/bsie/extractor/__init__.py
new file mode 100644
index 0000000..ef31343
--- /dev/null
+++ b/bsie/extractor/__init__.py
@@ -0,0 +1,15 @@
+"""Extractors produce triples from some content.
+
+Each Extractor class is linked to the Reader class whose content it requires.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# exports
+__all__: typing.Sequence[str] = []
+
+## EOF ##
diff --git a/bsie/extractor/generic/__init__.py b/bsie/extractor/generic/__init__.py
new file mode 100644
index 0000000..0cb7e7f
--- /dev/null
+++ b/bsie/extractor/generic/__init__.py
@@ -0,0 +1,16 @@
+"""Generic extractors focus on information that is typically available on all
+files. Examples include file system information (file name and size, mime type,
+etc.) and information that is independent of the actual file (constant triples,
+host platform infos, current time, etc.).
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# exports
+__all__: typing.Sequence[str] = []
+
+## EOF ##
diff --git a/bsie/extractor/generic/constant.py b/bsie/extractor/generic/constant.py
new file mode 100644
index 0000000..e243131
--- /dev/null
+++ b/bsie/extractor/generic/constant.py
@@ -0,0 +1,52 @@
+"""The Constant extractor produces pre-specified triples.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from bsie.base import extractor
+from bsie.utils.bsfs import URI
+from bsie.utils.node import Node
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Constant',
+    )
+
+
+## code ##
+
+class Constant(extractor.Extractor):
+    """Produce pre-specified (predicate, value) pairs as triples; no content is read."""
+
+    CONTENT_READER = None
+
+    def __init__(
+            self,
+            schema: str,
+            tuples: typing.Iterable[typing.Tuple[URI, typing.Any]],
+            ):
+        self._schema = schema
+        self._tuples = tuple(tuples)  # materialize: extract() iterates this on every call
+        # FIXME: use schema instance for predicate checking
+        #self._tuples = [(pred, value) for pred, value in tuples if pred in schema]
+        # FIXME: use schema instance for value checking
+
+    def schema(self) -> str:
+        return self._schema
+
+    def extract(
+            self,
+            subject: Node,
+            content: None,
+            predicates: typing.Iterable[URI],
+            ) -> typing.Iterator[typing.Tuple[Node, URI, typing.Any]]:
+        for pred, value in self._tuples:
+            if pred in predicates:
+                yield subject, pred, value
+
+## EOF ##
diff --git a/bsie/extractor/generic/path.py b/bsie/extractor/generic/path.py
new file mode 100644
index 0000000..c39bbd2
--- /dev/null
+++ b/bsie/extractor/generic/path.py
@@ -0,0 +1,70 @@
+"""Extract information from a file's path, such as the file name.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import os
+import typing
+
+# inner-module imports
+from bsie.base import extractor
+from bsie.utils import node, ns
+from bsie.utils.bsfs import URI
+import bsie.reader.path
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Path',
+    )
+
+
+## code ##
+
+class Path(extractor.Extractor):
+    """Extract information from file's path."""
+
+    CONTENT_READER = bsie.reader.path.Path
+
+    def __init__(self):
+        self.__callmap = {
+            ns.bse.filename: self.__filename,
+            }
+
+    def schema(self) -> str:
+        return '''
+            bse:filename a bsfs:Predicate ;
+                rdfs:domain bsfs:Entity ;
+                rdfs:range xsd:string ;
+                rdfs:label "File name"^^xsd:string ;
+                schema:description "Filename of entity in some filesystem."^^xsd:string ;
+                owl:maxCardinality "INF"^^xsd:number .
+            '''
+
+    def extract(
+            self,
+            subject: node.Node,
+            content: CONTENT_READER.CONTENT_TYPE,
+            predicates: typing.Iterable[URI],
+            ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]:
+        for pred in predicates:
+            # find callback
+            clbk = self.__callmap.get(pred)
+            if clbk is None:
+                continue
+            # get value
+            value = clbk(content)
+            if value is None:
+                continue
+            # produce triple
+            yield subject, pred, value
+
+    def __filename(self, path: str) -> typing.Optional[str]:
+        try:
+            return os.path.basename(path)
+        except Exception:
+            # FIXME: some kind of error reporting (e.g. logging)
+            return None
+
+## EOF ##
diff --git a/bsie/extractor/generic/stat.py b/bsie/extractor/generic/stat.py
new file mode 100644
index 0000000..d74369c
--- /dev/null
+++ b/bsie/extractor/generic/stat.py
@@ -0,0 +1,71 @@
+"""Extract information from the file system, such as filesize.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from bsie.base import extractor
+from bsie.utils import node, ns
+from bsie.utils.bsfs import URI
+import bsie.reader.stat
+
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Stat',
+    )
+
+
+## code ##
+
+class Stat(extractor.Extractor):
+    """Extract information from the file system."""
+
+    CONTENT_READER = bsie.reader.stat.Stat
+
+    def __init__(self):
+        self.__callmap = {
+            ns.bse.filesize: self.__filesize,
+            }
+
+    def schema(self) -> str:
+        return '''
+            bse:filesize a bsfs:Predicate ;
+                rdfs:domain bsfs:Entity ;
+                rdfs:range xsd:integer ;
+                rdfs:label "File size"^^xsd:string ;
+                schema:description "File size of entity in some filesystem."^^xsd:string ;
+                owl:maxCardinality "INF"^^xsd:number .
+            '''
+
+    def extract(
+            self,
+            subject: node.Node,
+            content: CONTENT_READER.CONTENT_TYPE,
+            predicates: typing.Iterable[URI],
+            ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]:
+        for pred in predicates:
+            # find callback
+            clbk = self.__callmap.get(pred)
+            if clbk is None:
+                continue
+            # get value
+            value = clbk(content)
+            if value is None:
+                continue
+            # produce triple
+            yield subject, pred, value
+
+    def __filesize(self, content: CONTENT_READER.CONTENT_TYPE) -> typing.Optional[int]:
+        """Return the file size, or None if it cannot be read from *content*."""
+        try:
+            return content.st_size
+        except Exception:
+            # FIXME: some kind of error reporting (e.g. logging)
+            return None
+
+## EOF ##
diff --git a/bsie/utils/__init__.py b/bsie/utils/__init__.py
index 1137187..bd22236 100644
--- a/bsie/utils/__init__.py
+++ b/bsie/utils/__init__.py
@@ -9,12 +9,14 @@ import typing

 # inner-module imports
 from . import bsfs
+from . import namespaces as ns
 from . import node

 # exports
 __all__: typing.Sequence[str] = (
     'bsfs',
     'node',
+    'ns',
     )

 ## EOF ##
diff --git a/bsie/utils/bsfs.py b/bsie/utils/bsfs.py
index 33eb178..1ae657c 100644
--- a/bsie/utils/bsfs.py
+++ b/bsie/utils/bsfs.py
@@ -8,11 +8,12 @@ Author: Matthias Baumgartner, 2022
 import typing

 # bsfs imports
-from bsfs.utils import URI
-from bsfs.utils import typename
+from bsfs.namespace import Namespace
+from bsfs.utils import URI, typename

 # exports
 __all__: typing.Sequence[str] = (
+    'Namespace',
     'URI',
     'typename',
     )
diff --git a/bsie/utils/namespaces.py b/bsie/utils/namespaces.py
new file mode 100644
index 0000000..67ccc71
--- /dev/null
+++ b/bsie/utils/namespaces.py
@@ -0,0 +1,25 @@
+"""Default namespaces used throughout BSIE.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsie imports
+from . import bsfs as _bsfs
+
+# constants
+bse = _bsfs.Namespace('http://bsfs.ai/schema/Entity#')
+bsfs = _bsfs.Namespace('http://bsfs.ai/schema/')
+bsm = _bsfs.Namespace('http://bsfs.ai/schema/meta#')
+
+# export
+__all__: typing.Sequence[str] = (
+    'bse',
+    'bsfs',
+    'bsm',
+    )
+
+## EOF ##
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/extractor/__init__.py b/test/extractor/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/extractor/generic/__init__.py b/test/extractor/generic/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/extractor/generic/test_constant.py b/test/extractor/generic/test_constant.py
new file mode 100644
index 0000000..f3ab0a3
--- /dev/null
+++ b/test/extractor/generic/test_constant.py
@@ -0,0 +1,63 @@
+"""
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import unittest
+
+# bsie imports
+from bsie.utils import ns
+from bsie.utils.node import Node
+
+# objects to test
+from bsie.extractor.generic.constant import Constant
+
+
+## code ##
+
+class TestConstant(unittest.TestCase):
+    def test_extract(self):
+        schema = '''
+            bse:author a bsfs:Predicate ;
+                rdfs:domain bsfs:Entity ;
+                rdfs:range xsd:string ;
+                owl:maxCardinality "1"^^xsd:number .
+
+            bse:comment a bsfs:Predicate ;
+                rdfs:domain bsfs:Entity ;
+                rdfs:range xsd:string ;
+                owl:maxCardinality "INF"^^xsd:number .
+
+            '''
+        tuples = [
+            (ns.bse.author, 'Me, myself, and I'),
+            (ns.bse.comment, 'the quick brown fox jumps over the lazy dog.'),
+            ]
+        node = Node(ns.bsfs.Entity, '') # Blank node
+        predicates = (ns.bse.author, ns.bse.comment)
+        ext = Constant(schema, tuples)
+        # baseline
+        self.assertSetEqual(set(ext.extract(node, None, predicates)),
+            {(node, pred, value) for pred, value in tuples})
+        # predicates is respected
+        self.assertSetEqual(set(ext.extract(node, None, (ns.bse.author, ns.bse.foobar))),
+            {(node, ns.bse.author, 'Me, myself, and I')})
+        self.assertSetEqual(set(ext.extract(node, None, (ns.bse.comment, ns.bse.foobar))),
+            {(node, ns.bse.comment, 'the quick brown fox jumps over the lazy dog.')})
+        self.assertSetEqual(set(ext.extract(node, None, (ns.bse.foobar, ns.bse.barfoo))), set())
+
+        # FIXME: should change!
+        # for now: no schema compliance
+        ext = Constant('', tuples)
+        self.assertSetEqual(set(ext.extract(node, None, predicates)),
+            {(node, pred, value) for pred, value in tuples})
+
+
+## main ##
+
+if __name__ == '__main__':
+    unittest.main()
+
+## EOF ##
diff --git a/test/extractor/generic/test_path.py b/test/extractor/generic/test_path.py
new file mode 100644
index 0000000..8623490
--- /dev/null
+++ b/test/extractor/generic/test_path.py
@@ -0,0 +1,45 @@
+"""
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import unittest
+
+# bsie imports
+from bsie.utils import ns
+from bsie.utils.node import Node
+
+# objects to test
+from bsie.extractor.generic.path import Path
+
+
+## code ##
+
+class TestPath(unittest.TestCase):
+    def test_extract(self):
+        node = Node(ns.bsfs.Entity, '') # Blank node
+        ext = Path()
+
+        # baseline
+        self.assertSetEqual(set(ext.extract(node, '/tmp/foo/bar', (ns.bse.filename, ))),
+            {(node, ns.bse.filename, 'bar')})
+        # predicates parameter is respected
+        self.assertSetEqual(set(ext.extract(node, '/tmp/foo/bar', (ns.bse.filename, ns.bse.foo))),
+            {(node, ns.bse.filename, 'bar')})
+        self.assertSetEqual(set(ext.extract(node, '/tmp/foo/bar', (ns.bse.foo, ))), set())
+        # path variations
+        self.assertSetEqual(set(ext.extract(node, 'bar', (ns.bse.filename, ))),
+            {(node, ns.bse.filename, 'bar')})
+        self.assertSetEqual(set(ext.extract(node, '', (ns.bse.filename, ))),
+            {(node, ns.bse.filename, '')})
+        self.assertSetEqual(set(ext.extract(node, None, (ns.bse.filename, ))), set())
+
+
+## main ##
+
+if __name__ == '__main__':
+    unittest.main()
+
+## EOF ##
diff --git a/test/extractor/generic/test_stat.py b/test/extractor/generic/test_stat.py
new file mode 100644
index 0000000..f89b053
--- /dev/null
+++ b/test/extractor/generic/test_stat.py
@@ -0,0 +1,43 @@
+"""
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import os
+import unittest
+
+# bsie imports
+from bsie.utils import ns
+from bsie.utils.node import Node
+
+# objects to test
+from bsie.extractor.generic.stat import Stat
+
+
+## code ##
+
+class TestConstant(unittest.TestCase):
+    def test_extract(self):
+        node = Node(ns.bsfs.Entity, '') # Blank node
+        content = os.stat(__file__)
+        ext = Stat()
+
+        # baseline
+        self.assertSetEqual(set(ext.extract(node, content, (ns.bse.filesize, ))),
+            {(node, ns.bse.filesize, content.st_size)})
+        # predicates parameter is respected
+        self.assertSetEqual(set(ext.extract(node, content, (ns.bse.filesize, ns.bse.foo))),
+            {(node, ns.bse.filesize, content.st_size)})
+        self.assertSetEqual(set(ext.extract(node, content, (ns.bse.foo, ))), set())
+        # content variations
+        self.assertSetEqual(set(ext.extract(node, None, (ns.bse.filesize, ))), set())
+
+
+## main ##
+
+if __name__ == '__main__':
+    unittest.main()
+
+## EOF ##
diff --git a/test/reader/__init__.py b/test/reader/__init__.py
new file mode 100644
index 0000000..e69de29
--
cgit v1.2.3