aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Matthias Baumgartner <dev@igsor.net> 2022-10-31 14:14:57 +0100
committer Matthias Baumgartner <dev@igsor.net> 2022-10-31 14:14:57 +0100
commit 9389c741bdbbca9adbff6099d440706cd63deac4 (patch)
tree 48ee0e912e2f19f51bd684d790f0bcc2d906e887
parent d4023fa972af379a4235f51783954671de974372 (diff)
parent 2da348c638ac5058d5acf09ab5df323ee04503d5 (diff)
download bsie-9389c741bdbbca9adbff6099d440706cd63deac4.tar.gz
bsie-9389c741bdbbca9adbff6099d440706cd63deac4.tar.bz2
bsie-9389c741bdbbca9adbff6099d440706cd63deac4.zip
Merge branch 'mb/extractors' into develop
-rw-r--r-- bsie/base/extractor.py 3
-rw-r--r-- bsie/extractor/__init__.py 15
-rw-r--r-- bsie/extractor/generic/__init__.py 16
-rw-r--r-- bsie/extractor/generic/constant.py 52
-rw-r--r-- bsie/extractor/generic/path.py 70
-rw-r--r-- bsie/extractor/generic/stat.py 71
-rw-r--r-- bsie/utils/__init__.py 2
-rw-r--r-- bsie/utils/bsfs.py 5
-rw-r--r-- bsie/utils/namespaces.py 25
-rw-r--r-- test/__init__.py 0
-rw-r--r-- test/extractor/__init__.py 0
-rw-r--r-- test/extractor/generic/__init__.py 0
-rw-r--r-- test/extractor/generic/test_constant.py 63
-rw-r--r-- test/extractor/generic/test_path.py 45
-rw-r--r-- test/extractor/generic/test_stat.py 43
-rw-r--r-- test/reader/__init__.py 0
16 files changed, 406 insertions, 4 deletions
diff --git a/bsie/base/extractor.py b/bsie/base/extractor.py
index d5b0922..ea43925 100644
--- a/bsie/base/extractor.py
+++ b/bsie/base/extractor.py
@@ -6,7 +6,6 @@ Author: Matthias Baumgartner, 2022
"""
# imports
import abc
-import collections
import typing
# inner-module imports
@@ -22,7 +21,7 @@ __all__: typing.Sequence[str] = (
## code ##
-class Extractor(abc.ABC, collections.abc.Iterable, collections.abc.Callable):
+class Extractor(abc.ABC):
"""Produce (node, predicate, value)-triples from some content."""
# what type of content is expected (i.e. reader subclass).
diff --git a/bsie/extractor/__init__.py b/bsie/extractor/__init__.py
new file mode 100644
index 0000000..ef31343
--- /dev/null
+++ b/bsie/extractor/__init__.py
@@ -0,0 +1,15 @@
+"""Extractors produce triples from some content.
+
+Each Extractor class is linked to the Reader class whose content it requires.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# exports
+__all__: typing.Sequence[str] = []
+
+## EOF ##
diff --git a/bsie/extractor/generic/__init__.py b/bsie/extractor/generic/__init__.py
new file mode 100644
index 0000000..0cb7e7f
--- /dev/null
+++ b/bsie/extractor/generic/__init__.py
@@ -0,0 +1,16 @@
+"""Generic extractors focus on information that is typically available on all
+files. Examples include file system information (file name and size, mime type,
+etc.) and information that is independent of the actual file (constant triples,
+host platform infos, current time, etc.).
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# exports
+__all__: typing.Sequence[str] = []
+
+## EOF ##
diff --git a/bsie/extractor/generic/constant.py b/bsie/extractor/generic/constant.py
new file mode 100644
index 0000000..e243131
--- /dev/null
+++ b/bsie/extractor/generic/constant.py
@@ -0,0 +1,52 @@
+"""The Constant extractor produces pre-specified triples.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from bsie.base import extractor
+from bsie.utils.bsfs import URI
+from bsie.utils.node import Node
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Constant',
+ )
+
+
+## code ##
+
+class Constant(extractor.Extractor):
+ """Produce pre-specified (constant) triples; no file content is required."""
+
+ CONTENT_READER = None
+
+ def __init__(
+ self,
+ schema: str,
+ tuples: typing.Iterable[typing.Tuple[URI, typing.Any]],
+ ):
+ self._schema = schema
+ self._tuples = tuples
+ # FIXME: use schema instance for predicate checking
+ #self._tuples = [(pred, value) for pred, value in tuples if pred in schema]
+ # FIXME: use schema instance for value checking
+
+ def schema(self) -> str:
+ return self._schema
+
+ def extract(
+ self,
+ subject: Node,
+ content: None,
+ predicates: typing.Iterable[URI],
+ ) -> typing.Iterator[typing.Tuple[Node, URI, typing.Any]]:
+ for pred, value in self._tuples:
+ if pred in predicates:
+ yield subject, pred, value
+
+## EOF ##
diff --git a/bsie/extractor/generic/path.py b/bsie/extractor/generic/path.py
new file mode 100644
index 0000000..c39bbd2
--- /dev/null
+++ b/bsie/extractor/generic/path.py
@@ -0,0 +1,70 @@
+"""Extract information from a file's path, such as the file name.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import os
+import typing
+
+# inner-module imports
+from bsie.base import extractor
+from bsie.utils import node, ns
+from bsie.utils.bsfs import URI
+import bsie.reader.path
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Path',
+ )
+
+
+## code ##
+
+class Path(extractor.Extractor):
+ """Extract information from file's path."""
+
+ CONTENT_READER = bsie.reader.path.Path
+
+ def __init__(self):
+ self.__callmap = {
+ ns.bse.filename: self.__filename,
+ }
+
+ def schema(self) -> str:
+ return '''
+ bse:filename a bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ rdf:label "File name"^^xsd:string ;
+ schema:description "Filename of entity in some filesystem."^^xsd:string ;
+ owl:maxCardinality "INF"^^xsd:number .
+ '''
+
+ def extract(
+ self,
+ subject: node.Node,
+ content: CONTENT_READER.CONTENT_TYPE,
+ predicates: typing.Iterable[URI],
+ ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]:
+ for pred in predicates:
+ # find callback
+ clbk = self.__callmap.get(pred)
+ if clbk is None:
+ continue
+ # get value
+ value = clbk(content)
+ if value is None:
+ continue
+ # produce triple
+ yield subject, pred, value
+
+ def __filename(self, path: str) -> str:
+ try:
+ return os.path.basename(path)
+ except Exception:
+ # FIXME: some kind of error reporting (e.g. logging)
+ return None
+
+## EOF ##
diff --git a/bsie/extractor/generic/stat.py b/bsie/extractor/generic/stat.py
new file mode 100644
index 0000000..d74369c
--- /dev/null
+++ b/bsie/extractor/generic/stat.py
@@ -0,0 +1,71 @@
+"""Extract information from the file system, such as filesize.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from bsie.base import extractor
+from bsie.utils import node, ns
+from bsie.utils.bsfs import URI
+import bsie.reader.stat
+
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Stat',
+ )
+
+
+## code ##
+
+class Stat(extractor.Extractor):
+ """Extract information from the file system."""
+
+ CONTENT_READER = bsie.reader.stat.Stat
+
+ def __init__(self):
+ self.__callmap = {
+ ns.bse.filesize: self.__filesize,
+ }
+
+ def schema(self) -> str:
+ return '''
+ bse:filesize a bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:integer ;
+ rdf:label "File size"^^xsd:string ;
+ schema:description "File size of entity in some filesystem."^^xsd:string ;
+ owl:maxCardinality "INF"^^xsd:number .
+ '''
+
+ def extract(
+ self,
+ subject: node.Node,
+ content: CONTENT_READER.CONTENT_TYPE,
+ predicates: typing.Iterable[URI],
+ ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]:
+ for pred in predicates:
+ # find callback
+ clbk = self.__callmap.get(pred)
+ if clbk is None:
+ continue
+ # get value
+ value = clbk(content)
+ if value is None:
+ continue
+ # produce triple
+ yield subject, pred, value
+
+ def __filesize(self, content: CONTENT_READER.CONTENT_TYPE) -> int:
+ """Return the file size."""
+ try:
+ return content.st_size
+ except Exception:
+ # FIXME: some kind of error reporting (e.g. logging)
+ return None
+
+## EOF ##
diff --git a/bsie/utils/__init__.py b/bsie/utils/__init__.py
index 1137187..bd22236 100644
--- a/bsie/utils/__init__.py
+++ b/bsie/utils/__init__.py
@@ -9,12 +9,14 @@ import typing
# inner-module imports
from . import bsfs
+from . import namespaces as ns
from . import node
# exports
__all__: typing.Sequence[str] = (
'bsfs',
'node',
+ 'ns',
)
## EOF ##
diff --git a/bsie/utils/bsfs.py b/bsie/utils/bsfs.py
index 33eb178..1ae657c 100644
--- a/bsie/utils/bsfs.py
+++ b/bsie/utils/bsfs.py
@@ -8,11 +8,12 @@ Author: Matthias Baumgartner, 2022
import typing
# bsfs imports
-from bsfs.utils import URI
-from bsfs.utils import typename
+from bsfs.namespace import Namespace
+from bsfs.utils import URI, typename
# exports
__all__: typing.Sequence[str] = (
+ 'Namespace',
'URI',
'typename',
)
diff --git a/bsie/utils/namespaces.py b/bsie/utils/namespaces.py
new file mode 100644
index 0000000..67ccc71
--- /dev/null
+++ b/bsie/utils/namespaces.py
@@ -0,0 +1,25 @@
+"""Default namespaces used throughout BSIE.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsie imports
+from . import bsfs as _bsfs
+
+# constants
+bse = _bsfs.Namespace('http://bsfs.ai/schema/Entity#')
+bsfs = _bsfs.Namespace('http://bsfs.ai/schema/')
+bsm = _bsfs.Namespace('http://bsfs.ai/schema/meta#')
+
+# export
+__all__: typing.Sequence[str] = (
+ 'bse',
+ 'bsfs',
+ 'bsm',
+ )
+
+## EOF ##
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/__init__.py
diff --git a/test/extractor/__init__.py b/test/extractor/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/extractor/__init__.py
diff --git a/test/extractor/generic/__init__.py b/test/extractor/generic/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/extractor/generic/__init__.py
diff --git a/test/extractor/generic/test_constant.py b/test/extractor/generic/test_constant.py
new file mode 100644
index 0000000..f3ab0a3
--- /dev/null
+++ b/test/extractor/generic/test_constant.py
@@ -0,0 +1,63 @@
+"""Tests for the Constant extractor.
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import unittest
+
+# bsie imports
+from bsie.utils import ns
+from bsie.utils.node import Node
+
+# objects to test
+from bsie.extractor.generic.constant import Constant
+
+
+## code ##
+
+class TestConstant(unittest.TestCase):
+ def test_extract(self):
+ schema = '''
+ bse:author a bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ owl:maxCardinality "1"^^xsd:number .
+
+ bse:comment a bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ owl:maxCardinality "INF"^^xsd:number .
+
+ '''
+ tuples = [
+ (ns.bse.author, 'Me, myself, and I'),
+ (ns.bse.comment, 'the quick brown fox jumps over the lazy dog.'),
+ ]
+ node = Node(ns.bsfs.Entity, '') # Blank node
+ predicates = (ns.bse.author, ns.bse.comment)
+ ext = Constant(schema, tuples)
+ # baseline
+ self.assertSetEqual(set(ext.extract(node, None, predicates)),
+ {(node, pred, value) for pred, value in tuples})
+ # predicates is respected
+ self.assertSetEqual(set(ext.extract(node, None, (ns.bse.author, ns.bse.foobar))),
+ {(node, ns.bse.author, 'Me, myself, and I')})
+ self.assertSetEqual(set(ext.extract(node, None, (ns.bse.comment, ns.bse.foobar))),
+ {(node, ns.bse.comment, 'the quick brown fox jumps over the lazy dog.')})
+ self.assertSetEqual(set(ext.extract(node, None, (ns.bse.foobar, ns.bse.barfoo))), set())
+
+ # FIXME: should change!
+ # for now: no schema compliance
+ ext = Constant('', tuples)
+ self.assertSetEqual(set(ext.extract(node, None, predicates)),
+ {(node, pred, value) for pred, value in tuples})
+
+
+## main ##
+
+if __name__ == '__main__':
+ unittest.main()
+
+## EOF ##
diff --git a/test/extractor/generic/test_path.py b/test/extractor/generic/test_path.py
new file mode 100644
index 0000000..8623490
--- /dev/null
+++ b/test/extractor/generic/test_path.py
@@ -0,0 +1,45 @@
+"""Tests for the Path extractor.
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import unittest
+
+# bsie imports
+from bsie.utils import ns
+from bsie.utils.node import Node
+
+# objects to test
+from bsie.extractor.generic.path import Path
+
+
+## code ##
+
+class TestPath(unittest.TestCase):
+ def test_extract(self):
+ node = Node(ns.bsfs.Entity, '') # Blank node
+ ext = Path()
+
+ # baseline
+ self.assertSetEqual(set(ext.extract(node, '/tmp/foo/bar', (ns.bse.filename, ))),
+ {(node, ns.bse.filename, 'bar')})
+ # predicates parameter is respected
+ self.assertSetEqual(set(ext.extract(node, '/tmp/foo/bar', (ns.bse.filename, ns.bse.foo))),
+ {(node, ns.bse.filename, 'bar')})
+ self.assertSetEqual(set(ext.extract(node, '/tmp/foo/bar', (ns.bse.foo, ))), set())
+ # path variations
+ self.assertSetEqual(set(ext.extract(node, 'bar', (ns.bse.filename, ))),
+ {(node, ns.bse.filename, 'bar')})
+ self.assertSetEqual(set(ext.extract(node, '', (ns.bse.filename, ))),
+ {(node, ns.bse.filename, '')})
+ self.assertSetEqual(set(ext.extract(node, None, (ns.bse.filename, ))), set())
+
+
+## main ##
+
+if __name__ == '__main__':
+ unittest.main()
+
+## EOF ##
diff --git a/test/extractor/generic/test_stat.py b/test/extractor/generic/test_stat.py
new file mode 100644
index 0000000..f89b053
--- /dev/null
+++ b/test/extractor/generic/test_stat.py
@@ -0,0 +1,43 @@
+"""Tests for the Stat extractor.
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import os
+import unittest
+
+# bsie imports
+from bsie.utils import ns
+from bsie.utils.node import Node
+
+# objects to test
+from bsie.extractor.generic.stat import Stat
+
+
+## code ##
+
+class TestConstant(unittest.TestCase):
+ def test_extract(self):
+ node = Node(ns.bsfs.Entity, '') # Blank node
+ content = os.stat(__file__)
+ ext = Stat()
+
+ # baseline
+ self.assertSetEqual(set(ext.extract(node, content, (ns.bse.filesize, ))),
+ {(node, ns.bse.filesize, content.st_size)})
+ # predicates parameter is respected
+ self.assertSetEqual(set(ext.extract(node, content, (ns.bse.filesize, ns.bse.foo))),
+ {(node, ns.bse.filesize, content.st_size)})
+ self.assertSetEqual(set(ext.extract(node, content, (ns.bse.foo, ))), set())
+ # content variations
+ self.assertSetEqual(set(ext.extract(node, None, (ns.bse.filesize, ))), set())
+
+
+## main ##
+
+if __name__ == '__main__':
+ unittest.main()
+
+## EOF ##
diff --git a/test/reader/__init__.py b/test/reader/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/reader/__init__.py