Merge branch 'develop' into main

author: Matthias Baumgartner <dev@igsor.net> 2022-12-18 14:22:31 +0100
committer: Matthias Baumgartner <dev@igsor.net> 2022-12-18 14:22:31 +0100
commit: 7582c280ad5324a2f0427999911c7e7abc14a6ab (patch)
tree: 0a59bbfe1c44d3497daad9f25ff9e7eb2bf9eb82 /bsie
parent: cb49e4567a18de6851286ff672e54f9a91865fe9 (diff)
parent: 057e09d6537bf5c39815661a75819081e3e5fda7 (diff)
download: bsie-7582c280ad5324a2f0427999911c7e7abc14a6ab.tar.gz
bsie-7582c280ad5324a2f0427999911c7e7abc14a6ab.tar.bz2
bsie-7582c280ad5324a2f0427999911c7e7abc14a6ab.zip
25 files changed, 1389 insertions, 0 deletions
diff --git a/bsie/__init__.py b/bsie/__init__.py
new file mode 100644
index 0000000..8d2308c
--- /dev/null
+++ b/bsie/__init__.py
@@ -0,0 +1,18 @@
+"""The BSIE module extracts triples from files for insertion into a BSFS storage.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import collections
+import typing
+
+# constants
+T_VERSION_INFO = collections.namedtuple('T_VERSION_INFO', ('major', 'minor', 'micro')) # pylint: disable=invalid-name
+version_info = T_VERSION_INFO(0, 0, 1)
+
+# exports
+__all__: typing.Sequence[str] = []
+
+## EOF ##
diff --git a/bsie/apps/__init__.py b/bsie/apps/__init__.py
new file mode 100644
index 0000000..a548c3c
--- /dev/null
+++ b/bsie/apps/__init__.py
@@ -0,0 +1,20 @@
+"""The apps module exposes the entry points of the BSIE applications (index, info).
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from .index import main as index
+from .info import main as info
+
+# exports
+__all__: typing.Sequence[str] = (
+    'index',
+    'info',
+    )
+
+## EOF ##
diff --git a/bsie/apps/index.py b/bsie/apps/index.py
new file mode 100644
index 0000000..1dbfdd8
--- /dev/null
+++ b/bsie/apps/index.py
@@ -0,0 +1,121 @@
+"""The index app indexes files or directories into BSFS.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import argparse
+import os
+import typing
+
+# bsie imports
+from bsie.base import errors
+from bsie.lib import BSIE
+from bsie.tools import builder
+from bsie.utils import bsfs
+
+# exports
+__all__: typing.Sequence[str] = (
+    'main',
+    )
+
+
+## code ##
+
+def main(argv):
+    """Index files or directories into BSFS."""
+    parser = argparse.ArgumentParser(description=main.__doc__, prog='index')
+    parser.add_argument('--user', type=bsfs.URI, default=bsfs.URI('http://example.com/me'),
+        help='User URI; passed to the pipeline (as namespace) and to the BSFS store.')
+    parser.add_argument('--collect', action='append', default=[],
+        help='Predicate URI to extract (repeatable). If omitted, all available ones are used.')
+    parser.add_argument('--discard', action='append', default=[],
+        help='Predicate URI to exclude from extraction (repeatable).')
+    parser.add_argument('-r', '--recursive', action='store_true', default=False,
+        help='Descend into directories given as input.')
+    parser.add_argument('--follow', action='store_true', default=False,
+        help='Follow symbolic links to directories when descending.')
+    parser.add_argument('--print', action='store_true', default=False,
+        help='Print the extracted triples instead of writing them to a BSFS store.')
+    parser.add_argument('input_file', nargs=argparse.REMAINDER,
+        help='Files (or, with --recursive, directories) to index.')
+    args = parser.parse_args(argv)
+
+    # FIXME: Read reader/extractor configs from a config file
+    # reader builder (no reader-specific arguments)
+    rbuild = builder.ReaderBuilder({})
+    # extractor builder (hard-coded set of extractors until a config file is supported)
+    ebuild = builder.ExtractorBuilder([
+        {'bsie.extractor.generic.path.Path': {}},
+        {'bsie.extractor.generic.stat.Stat': {}},
+        {'bsie.extractor.generic.constant.Constant': dict(
+            tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')],
+            schema='''
+                bse:author rdfs:subClassOf bsfs:Predicate ;
+                    rdfs:domain bsfs:Entity ;
+                    rdfs:range xsd:string ;
+                    bsfs:unique "true"^^xsd:boolean .
+                ''',
+            )},
+        ])
+    # pipeline builder; the user URI (with a trailing slash ensured) serves as namespace
+    pbuild = builder.PipelineBuilder(
+        bsfs.Namespace(args.user + ('/' if not args.user.endswith('/') else '')),
+        rbuild,
+        ebuild,
+        )
+
+    # build pipeline
+    pipeline = pbuild.build()
+    # build BSIE frontend
+    bsie = BSIE(pipeline, args.collect, args.discard)
+
+
+    def walk(handle):
+        """Walk through the input files and pass each (node, predicate, value) triple to *handle*."""
+        # FIXME: collect all triples by node, set all predicates at once
+        # FIXME: simplify code (below but maybe also above)
+        # FIXME: How to handle dependencies between data?
+        #        E.g. do I still want to link to a tag despite not being permitted to set its label?
+        # FIXME: node renaming?
+
+        # index input paths
+        for path in args.input_file:
+            if os.path.isdir(path) and args.recursive:
+                for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=args.follow):
+                    for filename in filenames:
+                        for node, pred, value in bsie.from_file(os.path.join(dirpath, filename)):
+                            handle(node, pred, value)
+            elif os.path.isfile(path):
+                for node, pred, value in bsie.from_file(path):
+                    handle(node, pred, value)
+            else: # reachable: missing path, special file, or directory without --recursive
+                raise errors.UnreachableError(f'{path}: not a file, or a directory without --recursive')
+
+
+    if args.print:
+        walk(print)
+        return None
+
+    # initialize bsfs
+    # NOTE: With persistent storages, the schema migration will be a separate operation.
+    # Here, we'd simply examine the schema and potentially discard more predicates.
+    store = bsfs.Open(bsfs.init_sparql_store(args.user))
+    store.migrate(bsie.schema)
+    # process files: write every triple to the store as it is produced
+    def handle(node, pred, value):
+        store.node(node.node_type, node.uri).set(pred.uri, value)
+    walk(handle)
+    # return store
+    return store
+
+
+
+## main ##
+
+if __name__ == '__main__':
+    import sys
+    main(sys.argv[1:])
+
+## EOF ##
diff --git a/bsie/apps/info.py b/bsie/apps/info.py
new file mode 100644
index 0000000..eaf1f71
--- /dev/null
+++ b/bsie/apps/info.py
@@ -0,0 +1,74 @@
+"""The info app shows information from BSIE.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import argparse
+import sys
+import typing
+
+# bsie imports
+from bsie.base import errors
+from bsie.tools import builder
+from bsie.utils import bsfs
+
+# exports
+__all__: typing.Sequence[str] = (
+    'main',
+    )
+
+
+## code ##
+
+def main(argv):
+    """Show information from BSIE."""
+    parser = argparse.ArgumentParser(description=main.__doc__, prog='info')
+    parser.add_argument('what', choices=('predicates', ),
+        help='Select what information to show.')
+    args = parser.parse_args(argv)
+
+    # FIXME: Read reader/extractor configs from a config file
+    # reader builder (no reader-specific arguments)
+    rbuild = builder.ReaderBuilder({})
+    # extractor builder (same hard-coded extractors as in the index app)
+    ebuild = builder.ExtractorBuilder([
+        {'bsie.extractor.generic.path.Path': {}},
+        {'bsie.extractor.generic.stat.Stat': {}},
+        {'bsie.extractor.generic.constant.Constant': dict(
+            tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')],
+            schema='''
+                bse:author rdfs:subClassOf bsfs:Predicate ;
+                    rdfs:domain bsfs:Entity ;
+                    rdfs:range xsd:string ;
+                    bsfs:unique "true"^^xsd:boolean .
+                ''',
+            )},
+        ])
+    # pipeline builder
+    pbuild = builder.PipelineBuilder(
+        bsfs.Namespace('http://example.com/me/'), # not actually used
+        rbuild,
+        ebuild,
+        )
+
+    # build pipeline
+    pipeline = pbuild.build()
+
+    # show info
+    if args.what == 'predicates':
+        # print the URI of every predicate in the pipeline's schema, one per line
+        for pred in pipeline.schema.predicates():
+            print(pred.uri)
+    else:
+        # unreachable: argparse restricts args.what to the declared choices
+        raise errors.UnreachableError()
+
+
+## main ##
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
+
+## EOF ##
diff --git a/bsie/base/__init__.py b/bsie/base/__init__.py
new file mode 100644
index 0000000..0d362cd
--- /dev/null
+++ b/bsie/base/__init__.py
@@ -0,0 +1,24 @@
+"""The base module defines the BSIE interfaces.
+
+You'll mostly find abstract classes here.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from . import errors
+from .extractor import Extractor
+from .reader import Reader
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Extractor',
+    'Reader',
+    'errors',
+    )
+
+## EOF ##
diff --git a/bsie/base/errors.py b/bsie/base/errors.py
new file mode 100644
index 0000000..dc3c30e
--- /dev/null
+++ b/bsie/base/errors.py
@@ -0,0 +1,42 @@
+"""Common BSIE exceptions.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# exports
+__all__: typing.Sequence[str] = (
+    'BuilderError',
+    'ExtractorError',
+    'LoaderError',
+    'ReaderError',
+    )
+
+
+## code ##
+
+class _BSIEError(Exception):
+    """Base class of all BSIE errors."""
+
+class BuilderError(_BSIEError):
+    """The Builder failed to create an instance."""
+
+class LoaderError(BuilderError):
+    """The Builder failed to load a module or class."""
+
+class ExtractorError(_BSIEError):
+    """The Extractor failed to process the given content."""
+
+class ReaderError(_BSIEError):
+    """The Reader failed to read the given file."""
+
+class ProgrammingError(_BSIEError):
+    """An assertion-like error that indicates a code-base issue."""
+
+class UnreachableError(ProgrammingError):
+    """Bravo, you've reached a point in code that should logically not be reachable."""
+
+## EOF ##
diff --git a/bsie/base/extractor.py b/bsie/base/extractor.py
new file mode 100644
index 0000000..c44021b
--- /dev/null
+++ b/bsie/base/extractor.py
@@ -0,0 +1,103 @@
+"""The Extractor classes transform content into triples.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import abc
+import typing
+
+# bsie imports
+from bsie.utils import bsfs, node, ns
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Extractor',
+    )
+
+# constants
+
+# essential definitions typically used in extractor schemas.
+# NOTE: This preamble is only for convenience; Each Extractor must implement its use, if so desired.
+SCHEMA_PREAMBLE = '''
+    # common external prefixes
+    prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+    prefix xsd: <http://www.w3.org/2001/XMLSchema#>
+    prefix schema: <http://schema.org/>
+
+    # common bsfs prefixes
+    prefix bsfs: <http://bsfs.ai/schema/>
+    prefix bse: <http://bsfs.ai/schema/Entity#>
+
+    # essential nodes
+    bsfs:Entity rdfs:subClassOf bsfs:Node .
+    bsfs:File rdfs:subClassOf bsfs:Entity .
+
+    # common definitions
+    xsd:string rdfs:subClassOf bsfs:Literal .
+    xsd:integer rdfs:subClassOf bsfs:Literal .
+
+    '''
+
+
+## code ##
+
+class Extractor(abc.ABC):
+    """Produce (subject, predicate, value)-triples from some content.
+    The Extractor produces principal predicates that provide information
+    about the content itself (i.e., triples that include the subject),
+    and may also generate triples with auxiliary predicates if the
+    extracted value is a node itself.
+    """
+
+    # qualified class name of the Reader whose content is expected; None if no content is read.
+    CONTENT_READER: typing.Optional[str] = None
+
+    # extractor schema, i.e. the predicates (and node/literal types) this extractor may produce.
+    _schema: bsfs.schema.Schema
+
+    def __init__(self, schema: bsfs.schema.Schema):
+        self._schema = schema
+
+    def __str__(self) -> str:
+        return bsfs.typename(self)
+
+    def __repr__(self) -> str:
+        return f'{bsfs.typename(self)}()'
+
+    def __eq__(self, other: typing.Any) -> bool:
+        return isinstance(other, type(self)) \
+          and self.CONTENT_READER == other.CONTENT_READER \
+          and self.schema == other.schema
+
+    def __hash__(self) -> int:
+        return hash((type(self), self.CONTENT_READER, self.schema))
+
+    @property
+    def schema(self) -> bsfs.schema.Schema:
+        """Return the extractor's schema."""
+        return self._schema
+
+    @property
+    def principals(self) -> typing.Iterator[bsfs.schema.Predicate]:
+        """Return the principal predicates, i.e., relations from/to the extraction subject."""
+        ent = self.schema.node(ns.bsfs.Entity) # the schema's bsfs:Entity node
+        return (
+            pred
+            for pred
+            in self.schema.predicates()
+            if pred.domain <= ent or (pred.range is not None and pred.range <= ent)
+            )
+
+    @abc.abstractmethod
+    def extract(
+            self,
+            subject: node.Node,
+            content: typing.Any,
+            principals: typing.Iterable[bsfs.schema.Predicate],
+            ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+        """Return (node, predicate, value) triples about *subject*, limited to *principals*."""
+
+## EOF ##
diff --git a/bsie/base/reader.py b/bsie/base/reader.py
new file mode 100644
index 0000000..cbabd36
--- /dev/null
+++ b/bsie/base/reader.py
@@ -0,0 +1,47 @@
+"""The Reader classes return high-level content structures from files.
+
+The Reader fulfills two purposes:
+    First, it brokers between multiple libraries and file formats.
+    Second, it separates multiple aspects of a file into distinct content types.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import abc
+import typing
+
+# bsie imports
+from bsie.utils import bsfs
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Reader',
+    )
+
+
+## code ##
+
+class Reader(abc.ABC):
+    """Abstract base class of readers, i.e. callables that return some content of a file."""
+
+    def __str__(self) -> str:
+        return bsfs.typename(self)
+
+    def __repr__(self) -> str:
+        return f'{bsfs.typename(self)}()'
+
+    def __eq__(self, other: typing.Any) -> bool:
+        return isinstance(other, type(self)) # the base class has no state; compare by type
+
+    def __hash__(self) -> int:
+        return hash(type(self)) # consistent with __eq__: hash by type only
+
+    @abc.abstractmethod
+    def __call__(self, path: bsfs.URI) -> typing.Any:
+        """Return some content of the file at *path*.
+        Raises a `ReaderError` if the reader cannot make sense of the file format.
+        """
+
+## EOF ##
diff --git a/bsie/extractor/__init__.py b/bsie/extractor/__init__.py
new file mode 100644
index 0000000..ef31343
--- /dev/null
+++ b/bsie/extractor/__init__.py
@@ -0,0 +1,15 @@
+"""Extractors produce triples from some content.
+
+Each Extractor class is linked to the Reader class whose content it requires.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# exports
+__all__: typing.Sequence[str] = []
+
+## EOF ##
diff --git a/bsie/extractor/generic/__init__.py b/bsie/extractor/generic/__init__.py
new file mode 100644
index 0000000..0cb7e7f
--- /dev/null
+++ b/bsie/extractor/generic/__init__.py
@@ -0,0 +1,16 @@
+"""Generic extractors focus on information that is typically available on all
+files. Examples include file system information (file name and size, mime type,
+etc.) and information that is independent of the actual file (constant triples,
+host platform infos, current time, etc.).
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# exports
+__all__: typing.Sequence[str] = []
+
+## EOF ##
diff --git a/bsie/extractor/generic/constant.py b/bsie/extractor/generic/constant.py
new file mode 100644
index 0000000..11384e6
--- /dev/null
+++ b/bsie/extractor/generic/constant.py
@@ -0,0 +1,57 @@
+"""The Constant extractor produces pre-specified triples.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsie imports
+from bsie.base import extractor
+from bsie.utils import bsfs, node
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Constant',
+    )
+
+
+## code ##
+
+class Constant(extractor.Extractor):
+    """Produce pre-specified (predicate, value) pairs for every subject."""
+
+    CONTENT_READER = None # this extractor needs no file content
+
+    # predicate/value pairs to be produced.
+    _tuples: typing.Tuple[typing.Tuple[bsfs.schema.Predicate, typing.Any], ...]
+
+    def __init__(
+            self,
+            schema: str,
+            tuples: typing.Iterable[typing.Tuple[bsfs.URI, typing.Any]],
+            ):
+        super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + schema))
+        # NOTE: Raises a KeyError if the predicate is not part of the schema
+        self._tuples = tuple((self.schema.predicate(p_uri), value) for p_uri, value in tuples)
+        # TODO: use schema instance for value checking
+
+    def __eq__(self, other: typing.Any) -> bool:
+        return super().__eq__(other) \
+           and self._tuples == other._tuples
+
+    def __hash__(self) -> int:
+        return hash((super().__hash__(), self._tuples))
+
+    def extract(
+            self,
+            subject: node.Node,
+            content: None,
+            principals: typing.Iterable[bsfs.schema.Predicate],
+            ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+        requested = set(principals) # materialize once: *principals* may be a one-shot iterator
+        for pred, value in (pair for pair in self._tuples if pair[0] in requested):
+            yield subject, pred, value
+
+## EOF ##
diff --git a/bsie/extractor/generic/path.py b/bsie/extractor/generic/path.py
new file mode 100644
index 0000000..7018e12
--- /dev/null
+++ b/bsie/extractor/generic/path.py
@@ -0,0 +1,74 @@
+"""The Path extractor produces triples from a file's path.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import os
+import typing
+
+# bsie imports
+from bsie.base import extractor
+from bsie.utils import bsfs, node, ns
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Path',
+    )
+
+
+## code ##
+
+class Path(extractor.Extractor):
+    """Extract the file name (bse:filename) from a file's path."""
+
+    CONTENT_READER = 'bsie.reader.path.Path'
+
+    # mapping from predicate to handler function.
+    _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[str], typing.Any]]
+
+    def __init__(self):
+        super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+            bse:filename rdfs:subClassOf bsfs:Predicate ;
+                rdfs:domain bsfs:File ;
+                rdfs:range xsd:string ;
+                rdfs:label "File name"^^xsd:string ;
+                schema:description "Filename of entity in some filesystem."^^xsd:string ;
+                bsfs:unique "false"^^xsd:boolean .
+            '''))
+        self._callmap = {
+            self.schema.predicate(ns.bse.filename): self.__filename,
+            }
+
+    def extract(
+            self,
+            subject: node.Node,
+            content: str,
+            principals: typing.Iterable[bsfs.schema.Predicate],
+            ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+        for pred in principals:
+            # find callback; predicates without a handler in this extractor are skipped
+            clbk = self._callmap.get(pred)
+            if clbk is None:
+                continue
+            # get value; None means the handler could not produce one
+            value = clbk(content)
+            if value is None:
+                continue
+            # produce triple
+            yield subject, pred, value
+
+    def __filename(self, path: str) -> typing.Optional[str]:
+        try:
+            return os.path.basename(path)
+        except Exception: # pylint: disable=broad-except # we explicitly want to catch everything
+            # some error, skip
+            # FIXME: some kind of error reporting (e.g. logging)?
+            # Options: (a) Fail silently (current); (b) Skip and report to log;
+            # (c) Raise ExtractorError (aborts extraction); (d) separate content type
+            # checks from basename errors (report content type errors, skip basename
+            # errors)
+            return None
+
+## EOF ##
diff --git a/bsie/extractor/generic/stat.py b/bsie/extractor/generic/stat.py
new file mode 100644
index 0000000..0b9ce29
--- /dev/null
+++ b/bsie/extractor/generic/stat.py
@@ -0,0 +1,70 @@
+"""Extract information from the file system, such as filesize.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import os
+import typing
+
+# bsie imports
+from bsie.base import extractor
+from bsie.utils import bsfs, node, ns
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Stat',
+    )
+
+
+## code ##
+
+class Stat(extractor.Extractor):
+    """Extract the file size (bse:filesize) from a file's stat result."""
+
+    CONTENT_READER = 'bsie.reader.stat.Stat'
+
+    # mapping from predicate to handler function.
+    _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[os.stat_result], typing.Any]]
+
+    def __init__(self):
+        super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+            bse:filesize rdfs:subClassOf bsfs:Predicate ;
+                rdfs:domain bsfs:File ;
+                rdfs:range xsd:integer ;
+                rdfs:label "File size"^^xsd:string ;
+                schema:description "File size of entity in some filesystem."^^xsd:string ;
+                bsfs:unique "false"^^xsd:boolean .
+            '''))
+        self._callmap = {
+            self.schema.predicate(ns.bse.filesize): self.__filesize,
+            }
+
+    def extract(
+            self,
+            subject: node.Node,
+            content: os.stat_result,
+            principals: typing.Iterable[bsfs.schema.Predicate],
+            ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+        for pred in principals:
+            # find callback; predicates without a handler in this extractor are skipped
+            clbk = self._callmap.get(pred)
+            if clbk is None:
+                continue
+            # get value; None means the handler could not produce one
+            value = clbk(content)
+            if value is None:
+                continue
+            # produce triple
+            yield subject, pred, value
+
+    def __filesize(self, content: os.stat_result) -> typing.Optional[int]:
+        """Return the file size (st_size), or None if it cannot be read from *content*."""
+        try:
+            return content.st_size
+        except Exception: # pylint: disable=broad-except # we explicitly want to catch everything
+            # FIXME: some kind of error reporting (e.g. logging)
+            return None
+
+## EOF ##
diff --git a/bsie/lib/__init__.py b/bsie/lib/__init__.py
new file mode 100644
index 0000000..578c2c4
--- /dev/null
+++ b/bsie/lib/__init__.py
@@ -0,0 +1,18 @@
+"""The lib module exposes the BSIE frontend class.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from .bsie import BSIE
+
+# exports
+__all__: typing.Sequence[str] = (
+    'BSIE',
+    )
+
+## EOF ##
diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py
new file mode 100644
index 0000000..e087fa9
--- /dev/null
+++ b/bsie/lib/bsie.py
@@ -0,0 +1,92 @@
+"""The BSIE class extracts triples from files.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsie imports
+from bsie.tools import Pipeline
+from bsie.utils import bsfs, node, ns
+
+# exports
+__all__: typing.Sequence[str] = (
+    'BSIE',
+    )
+
+
+## code ##
+
+class BSIE():
+    """Extract triples from files.
+
+    Controls which predicates to extract (*collect*) and
+    which to not extract (*discard*). Note that this only affects
+    principal predicates, not auxiliary predicates like, e.g., tag labels.
+
+    """
+
+    # pipeline
+    _pipeline: Pipeline
+
+    # predicates to extract.
+    _principals: typing.Set[bsfs.URI]
+
+    # local schema.
+    _schema: bsfs.schema.Schema
+
+    def __init__(
+            self,
+            # extraction pipeline.
+            pipeline: Pipeline,
+            # principals to extract at most. None or empty implies all available w.r.t. extractors.
+            collect: typing.Optional[typing.Iterable[bsfs.URI]] = None,
+            # principals to discard.
+            discard: typing.Optional[typing.Iterable[bsfs.URI]] = None,
+            ):
+        # store pipeline
+        self._pipeline = pipeline
+        # start off with available principals
+        self._principals = {pred.uri for pred in self._pipeline.principals}
+        # limit principals to specified ones by argument (an empty *collect* imposes no limit).
+        if collect is not None:
+            collect = set(collect)
+            if len(collect) > 0:
+                self._principals &= collect
+        # discard principals.
+        if discard is not None:
+            self._principals -= set(discard)
+        # discard ns.bsfs.Predicate
+        self._principals.discard(ns.bsfs.Predicate)
+        # compile a schema that only contains the requested principals (and auxiliary predicates)
+        self._schema = self._pipeline.subschema(
+            self._pipeline.schema.predicate(pred) for pred in self._principals)
+
+    @property
+    def schema(self) -> bsfs.schema.Schema:
+        """Return the BSIE schema."""
+        return self._schema
+
+    @property
+    def principals(self) -> typing.Iterator[bsfs.URI]:
+        """Return an iterator to the principal predicates."""
+        return iter(self._principals)
+
+    def from_file(
+            self,
+            path: bsfs.URI,
+            principals: typing.Optional[typing.Iterable[bsfs.URI]] = None,
+            ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.URI, typing.Any]]:
+        """Produce triples for a given *path*. Limit to *principals* if given."""
+        # get requested principals (all enabled ones if none were requested).
+        principals = set(principals) if principals is not None else self._principals
+        # restrict to the enabled principals (a no-op if this is self._principals itself).
+        principals &= self._principals
+        # predicate lookup
+        principals = {self.schema.predicate(pred) for pred in principals}
+        # invoke pipeline
+        yield from self._pipeline(path, principals)
+
+## EOF ##
diff --git a/bsie/reader/__init__.py b/bsie/reader/__init__.py
new file mode 100644
index 0000000..a45f22b
--- /dev/null
+++ b/bsie/reader/__init__.py
@@ -0,0 +1,19 @@
+"""The Reader classes return high-level content structures from files.
+
+The Reader fulfills two purposes:
+    First, it brokers between multiple libraries and file formats.
+    Second, it separates multiple aspects of a file into distinct content types.
+
+Often, different libraries focus on reading different types of content from a
+file. E.g. one would use different modules to read file system infos than to
+read exif or pixel data of an image. Hence, this module is organized by content
+type. Each distinct type can be implemented in a file or submodule that
+provides a Reader implementation. Through utilization of submodules, different
+file formats can be supported.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+
+## EOF ##
diff --git a/bsie/reader/path.py b/bsie/reader/path.py
new file mode 100644
index 0000000..d60f187
--- /dev/null
+++ b/bsie/reader/path.py
@@ -0,0 +1,28 @@
+"""The Path reader produces a file path.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsie imports
+from bsie.base import reader
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Path',
+    )
+
+
+## code ##
+
+class Path(reader.Reader):
+    """Reader whose content is the file path itself (returned unchanged)."""
+
+    def __call__(self, path: str) -> str:
+        return path
+
+
+## EOF ##
diff --git a/bsie/reader/stat.py b/bsie/reader/stat.py
new file mode 100644
index 0000000..fc5fb24
--- /dev/null
+++ b/bsie/reader/stat.py
@@ -0,0 +1,32 @@
+"""The Stat reader produces filesystem stat information.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import os
+import typing
+
+# bsie imports
+from bsie.base import errors, reader
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Stat',
+    )
+
+
+## code ##
+
+class Stat(reader.Reader):
+    """Read and return the filesystem's stat infos (the result of `os.stat`)."""
+
+    def __call__(self, path: str) -> os.stat_result:
+        try:
+            return os.stat(path)
+        except Exception as err: # any failure of os.stat is reported as a ReaderError
+            raise errors.ReaderError(path) from err
+
+
+## EOF ##
diff --git a/bsie/tools/__init__.py b/bsie/tools/__init__.py
new file mode 100644
index 0000000..803c321
--- /dev/null
+++ b/bsie/tools/__init__.py
@@ -0,0 +1,20 @@
+"""The tools module provides the Pipeline and its builders.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from . import builder
+from .pipeline import Pipeline
+
+# exports
+__all__: typing.Sequence[str] = (
+    'builder',
+    'Pipeline',
+    )
+
+## EOF ##
diff --git a/bsie/tools/builder.py b/bsie/tools/builder.py
new file mode 100644
index 0000000..190d9bf
--- /dev/null
+++ b/bsie/tools/builder.py
@@ -0,0 +1,226 @@
+"""Builders that create readers, extractors, and pipelines from their specifications.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import importlib
+import logging
+import typing
+
+# bsie imports
+from bsie import base
+from bsie.base import errors
+from bsie.utils import bsfs
+
+# inner-module imports
+from . import pipeline
+
+# exports
+__all__: typing.Sequence[str] = (
+    'ExtractorBuilder',
+    'PipelineBuilder',
+    'ReaderBuilder',
+    )
+
+
+## code ##
+
+logger = logging.getLogger(__name__)
+
+def _safe_load(module_name: str, class_name: str):
+    """Import *module_name* and return its attribute *class_name*.
+
+    Raise `errors.LoaderError` if the module cannot be imported or
+    if it has no attribute of that name; the cause is chained.
+    """
+    try:
+        module = importlib.import_module(module_name)
+    except Exception as err:
+        # cannot import module
+        raise errors.LoaderError(f'cannot load module {module_name}') from err
+
+    try:
+        return getattr(module, class_name)
+    except Exception as err:
+        # cannot find the class
+        raise errors.LoaderError(f'cannot load class {class_name} from module {module_name}') from err
+
+
+def _unpack_name(name):
+    """Split a qualified class name at its last dot into (module, class)."""
+    if not isinstance(name, str):
+        raise TypeError(name)
+    # without a dot, rpartition leaves the module part empty
+    module_name, _, class_name = name.rpartition('.')
+    if module_name == '' or class_name == '':
+        # a leading or trailing dot is not a qualified class name either
+        raise ValueError('name must be a qualified class name.')
+    return module_name, class_name
+
+
+class ReaderBuilder():
+    """Create and cache `bsie.base.Reader` instances by qualified class name.
+
+    A reader is requested by its qualified class name (e.g.,
+    bsie.reader.path.Path). The *kwargs* mapping passed to the
+    constructor holds, per qualified name, the keyword arguments
+    for that reader class; names without an entry are built
+    without arguments. Every name is instantiated at most once:
+    later requests return the cached instance, since they would
+    be built from the same keyword arguments anyway.
+
+    """
+
+    # per-reader keyword arguments (qualified name -> kwargs).
+    _kwargs: typing.Dict[str, typing.Dict[str, typing.Any]]
+
+    # readers built so far (qualified name -> instance).
+    _cache: typing.Dict[str, base.Reader]
+
+    def __init__(self, kwargs: typing.Dict[str, typing.Dict[str, typing.Any]]):
+        self._cache = {}
+        self._kwargs = kwargs
+
+    def build(self, name: str) -> base.Reader:
+        """Return the reader instance for the qualified class name *name*."""
+        # serve repeated requests from the cache
+        try:
+            return self._cache[name]
+        except KeyError:
+            pass
+
+        # split the name, then import the reader class
+        module_name, class_name = _unpack_name(name)
+        reader_cls = _safe_load(module_name, class_name)
+
+        # look up the constructor arguments; default to none
+        options = self._kwargs.get(name, {})
+        if not isinstance(options, dict):
+            raise TypeError(f'expected a kwargs dict, found {bsfs.typename(options)}')
+
+        # instantiate; any constructor failure becomes a BuilderError
+        try:
+            instance = reader_cls(**options)
+        except Exception as err:
+            raise errors.BuilderError(f'failed to build reader {name} due to {bsfs.typename(err)}: {err}') from err
+
+        # remember the instance for later requests
+        self._cache[name] = instance
+        return instance
+
+
+class ExtractorBuilder():
+    """Build `bsie.base.Extractor` instances from a list of specifications.
+
+    The same extractor class may be built several times (typically
+    with different arguments), so extractors are addressed by their
+    position in a list of build specifications rather than by name.
+    Each specification is a dict with exactly one item: the extractor's
+    qualified class name as key and a dict of keyword arguments as value.
+    Example: [{'bsie.extractor.generic.path.Path': {}}, ]
+
+    """
+
+    # build specifications, one {name: kwargs} dict per extractor.
+    _specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]
+
+    def __init__(self, specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]):
+        self._specs = specs
+
+    def __iter__(self) -> typing.Iterator[int]:
+        """Iterate over the indices of the extractor specifications."""
+        return iter(range(len(self._specs)))
+
+    def build(self, index: int) -> base.Extractor:
+        """Return a new instance of the extractor specified at position *index*.
+
+        Raise TypeError if the specification is malformed and
+        `errors.BuilderError` if the constructor fails. Errors raised
+        while resolving the class name are passed through.
+        """
+        spec = self._specs[index]
+
+        # a specification must have the shape {name: {kwargs}}
+        if not isinstance(spec, dict):
+            raise TypeError(f'expected a dict, found {bsfs.typename(spec)}')
+        if len(spec) != 1:
+            raise TypeError(f'expected a dict of length one, found {len(spec)}')
+
+        # the single item holds the qualified name and the constructor arguments
+        name, options = next(iter(spec.items()))
+        if not isinstance(options, dict):
+            raise TypeError(f'expected a dict, found {bsfs.typename(options)}')
+
+        # split the name, then import the extractor class
+        module_name, class_name = _unpack_name(name)
+        extractor_cls = _safe_load(module_name, class_name)
+
+        # instantiate; any constructor failure becomes a BuilderError
+        try:
+            instance = extractor_cls(**options)
+        except Exception as err:
+            raise errors.BuilderError(f'failed to build extractor {name} due to {bsfs.typename(err)}: {err}') from err
+        return instance
+
+
+class PipelineBuilder():
+    """Assemble a `bsie.tools.pipeline.Pipeline` from extractor and reader builders."""
+
+    # namespace prefix handed to the Pipeline.
+    prefix: bsfs.Namespace
+
+    # builds the readers required by the extractors.
+    rbuild: ReaderBuilder
+
+    # builds the extractors that make up the pipeline.
+    ebuild: ExtractorBuilder
+
+    def __init__(
+            self,
+            prefix: bsfs.Namespace,
+            reader_builder: ReaderBuilder,
+            extractor_builder: ExtractorBuilder,
+            ):
+        self.ebuild = extractor_builder
+        self.rbuild = reader_builder
+        self.prefix = prefix
+
+    def build(self) -> pipeline.Pipeline:
+        """Return a Pipeline of all extractors that could be built.
+
+        Best effort: an extractor is logged and left out if it, or
+        the reader it requires, fails to load or to build.
+        """
+        readers = {}
+        for index in self.ebuild:
+            # build the extractor; skip it on failure
+            try:
+                extractor = self.ebuild.build(index)
+            except errors.LoaderError as err:
+                logger.error('failed to load extractor: %s', err)
+                continue
+            except errors.BuilderError as err:
+                logger.error(str(err))
+                continue
+
+            # build the reader the extractor asks for, if any
+            try:
+                wanted = extractor.CONTENT_READER
+                reader = self.rbuild.build(wanted) if wanted is not None else None
+            except errors.LoaderError as err:
+                logger.error('failed to load reader: %s', err)
+                continue
+            except errors.BuilderError as err:
+                logger.error(str(err))
+                continue
+
+            # only extractors with a usable reader enter the pipeline
+            readers[extractor] = reader
+
+        return pipeline.Pipeline(self.prefix, readers)
+
+
+
+## EOF ##
diff --git a/bsie/tools/pipeline.py b/bsie/tools/pipeline.py
new file mode 100644
index 0000000..20e8ddf
--- /dev/null
+++ b/bsie/tools/pipeline.py
@@ -0,0 +1,144 @@
+"""The Pipeline runs readers and extractors on a file to produce triples.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+from collections import defaultdict
+import logging
+import typing
+
+# bsie imports
+from bsie import base
+from bsie.utils import bsfs, node, ns
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Pipeline',
+    )
+
+# constants
+FILE_PREFIX = 'file#' # appended to the namespace prefix to form the prefix of file nodes
+
+## code ##
+
+logger = logging.getLogger(__name__)
+
+class Pipeline():
+    """Extraction pipeline to generate triples from files.
+
+    The Pipeline binds readers and extractors, and performs
+    the necessary operations to produce triples from a file.
+    It takes a best-effort approach to extract as many triples
+    as possible. Errors during the extraction are passed over
+    and reported to the log.
+
+    """
+
+    # combined extractor schemas.
+    _schema: bsfs.schema.Schema
+
+    # node prefix (the prefix passed to the constructor plus FILE_PREFIX).
+    _prefix: bsfs.Namespace
+
+    # extractor -> reader mapping (None if the extractor requires no reader).
+    _ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]]
+
+    def __init__(
+            self,
+            prefix: bsfs.Namespace,
+            ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]]
+            ):
+        # store core members
+        self._prefix = prefix + FILE_PREFIX
+        self._ext2rdr = ext2rdr
+        # compile schema from all extractors
+        self._schema = bsfs.schema.Schema.Union(ext.schema for ext in ext2rdr)
+
+    def __str__(self) -> str:
+        return bsfs.typename(self)
+
+    def __repr__(self) -> str:
+        return f'{bsfs.typename(self)}(...)'
+
+    def __hash__(self) -> int:
+        # order-independent, like the dict comparison in __eq__: equal pipelines hash equal.
+        return hash((type(self), self._prefix, self._schema, frozenset(self._ext2rdr.items())))
+
+    def __eq__(self, other: typing.Any) -> bool:
+        return isinstance(other, type(self)) \
+           and self._schema == other._schema \
+           and self._prefix == other._prefix \
+           and self._ext2rdr == other._ext2rdr
+
+    @property
+    def schema(self) -> bsfs.schema.Schema:
+        """Return the pipeline's schema (combined from all extractors)."""
+        return self._schema
+
+    @property
+    def principals(self) -> typing.Iterator[bsfs.schema.Predicate]:
+        """Return the principal predicates that can be extracted."""
+        return iter({pred for ext in self._ext2rdr for pred in ext.principals})
+
+    def subschema(self, principals: typing.Iterable[bsfs.schema.Predicate]) -> bsfs.schema.Schema:
+        """Return the subset of the schema that supports the given *principals*."""
+        # materialize principals
+        principals = set(principals)
+        # collect and combine schemas from extractors
+        return bsfs.schema.Schema.Union({
+            ext.schema
+            for ext
+            in self._ext2rdr
+            if not set(ext.principals).isdisjoint(principals)
+            })
+
+    def __call__(
+            self,
+            path: bsfs.URI,
+            principals: typing.Optional[typing.Iterable[bsfs.schema.Predicate]] = None,
+            ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+        """Extract triples from the file at *path*. Optionally, limit triples to *principals*.
+
+        Reader and extractor errors are logged and skipped (best effort).
+        """
+        # get principals
+        principals = set(principals) if principals is not None else set(self.schema.predicates())
+
+        # get extractors
+        extractors = {ext for ext in self._ext2rdr if not set(ext.principals).isdisjoint(principals)}
+
+        # corner-case short-cut
+        if len(extractors) == 0:
+            return
+
+        # get readers -> extractors mapping
+        rdr2ext = defaultdict(set)
+        for ext in extractors:
+            rdr2ext[self._ext2rdr[ext]].add(ext)
+
+        # create subject for file
+        uuid = bsfs.uuid.UCID.from_path(path)
+        subject = node.Node(ns.bsfs.File, self._prefix[uuid])
+
+        # extract information
+        for rdr, extrs in rdr2ext.items():
+            try:
+                # get content
+                content = rdr(path) if rdr is not None else None
+                # apply extractors on this content
+                for ext in extrs:
+                    try:
+                        # distinct name: rebinding *subject* would leak into later extract() calls.
+                        for subj, pred, value in ext.extract(subject, content, principals):
+                            yield subj, pred, value
+                    except base.errors.ExtractorError as err:
+                        # critical extractor failure.
+                        logger.error('%s failed to extract triples from content: %s', ext, err)
+            except base.errors.ReaderError as err:
+                # failed to read any content. skip.
+                logger.error('%s failed to read content: %s', rdr, err)
+
+
+## EOF ##
diff --git a/bsie/utils/__init__.py b/bsie/utils/__init__.py
new file mode 100644
index 0000000..bd22236
--- /dev/null
+++ b/bsie/utils/__init__.py
@@ -0,0 +1,22 @@
+"""Common tools and definitions.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from . import bsfs
+from . import namespaces as ns
+from . import node
+
+# exports
+__all__: typing.Sequence[str] = (
+    'bsfs',
+    'node',
+    'ns',
+    )
+
+## EOF ##
diff --git a/bsie/utils/bsfs.py b/bsie/utils/bsfs.py
new file mode 100644
index 0000000..0b88479
--- /dev/null
+++ b/bsie/utils/bsfs.py
@@ -0,0 +1,27 @@
+"""BSFS bridge, provides BSFS bindings for BSIE.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsfs imports
+from bsfs import Open, schema
+from bsfs.apps.init import init_sparql_store
+from bsfs.namespace import Namespace
+from bsfs.utils import URI, typename, uuid
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Namespace',
+    'Open',
+    'URI',
+    'init_sparql_store',
+    'schema',
+    'typename',
+    'uuid',
+    )
+
+## EOF ##
diff --git a/bsie/utils/namespaces.py b/bsie/utils/namespaces.py
new file mode 100644
index 0000000..a29fc1b
--- /dev/null
+++ b/bsie/utils/namespaces.py
@@ -0,0 +1,27 @@
+"""Default namespaces used throughout BSIE.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from . import bsfs as _bsfs
+
+# constants: namespaces of the vocabularies used throughout BSIE
+bse = _bsfs.Namespace('http://bsfs.ai/schema/Entity')
+bsfs = _bsfs.Namespace('http://bsfs.ai/schema', fsep='/') # NOTE(review): the only one with an explicit fsep; confirm intended
+bsm = _bsfs.Namespace('http://bsfs.ai/schema/Meta')
+xsd = _bsfs.Namespace('http://www.w3.org/2001/XMLSchema')
+
+# export
+__all__: typing.Sequence[str] = (
+    'bse',
+    'bsfs',
+    'bsm',
+    'xsd',
+    )
+
+## EOF ##
diff --git a/bsie/utils/node.py b/bsie/utils/node.py
new file mode 100644
index 0000000..ecf39cd
--- /dev/null
+++ b/bsie/utils/node.py
@@ -0,0 +1,53 @@
+"""Lightweight Node to bridge to BSFS.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsie imports
+from bsie.utils import bsfs
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Node',
+    )
+
+
+## code ##
+
+class Node():
+    """A node type plus a node URI; disconnected from any bsfs structures."""
+
+    # URI of the node's type.
+    node_type: bsfs.URI
+
+    # URI of the node itself.
+    uri: bsfs.URI
+
+    def __init__(
+            self,
+            node_type: bsfs.URI,
+            uri: bsfs.URI,
+            ):
+        # convert both arguments to URI instances
+        self.node_type = bsfs.URI(node_type)
+        self.uri = bsfs.URI(uri)
+
+    def __eq__(self, other: typing.Any) -> bool:
+        if not isinstance(other, Node):
+            return False
+        return other.node_type == self.node_type and other.uri == self.uri
+
+    def __hash__(self) -> int:
+        return hash((type(self), self.node_type, self.uri))
+
+    def __str__(self) -> str:
+        return f'{bsfs.typename(self)}({self.node_type}, {self.uri})'
+
+    def __repr__(self) -> str:
+        return f'{bsfs.typename(self)}({self.node_type}, {self.uri})'
+
+## EOF ##
author	Matthias Baumgartner <dev@igsor.net>	2022-12-18 14:22:31 +0100
committer	Matthias Baumgartner <dev@igsor.net>	2022-12-18 14:22:31 +0100
commit	7582c280ad5324a2f0427999911c7e7abc14a6ab (patch)
tree	0a59bbfe1c44d3497daad9f25ff9e7eb2bf9eb82 /bsie
parent	cb49e4567a18de6851286ff672e54f9a91865fe9 (diff)
parent	057e09d6537bf5c39815661a75819081e3e5fda7 (diff)
download	bsie-7582c280ad5324a2f0427999911c7e7abc14a6ab.tar.gz bsie-7582c280ad5324a2f0427999911c7e7abc14a6ab.tar.bz2 bsie-7582c280ad5324a2f0427999911c7e7abc14a6ab.zip