diff options
author | Matthias Baumgartner <dev@igsor.net> | 2022-12-18 14:22:31 +0100 |
---|---|---|
committer | Matthias Baumgartner <dev@igsor.net> | 2022-12-18 14:22:31 +0100 |
commit | 7582c280ad5324a2f0427999911c7e7abc14a6ab (patch) | |
tree | 0a59bbfe1c44d3497daad9f25ff9e7eb2bf9eb82 /bsie | |
parent | cb49e4567a18de6851286ff672e54f9a91865fe9 (diff) | |
parent | 057e09d6537bf5c39815661a75819081e3e5fda7 (diff) | |
download | bsie-7582c280ad5324a2f0427999911c7e7abc14a6ab.tar.gz bsie-7582c280ad5324a2f0427999911c7e7abc14a6ab.tar.bz2 bsie-7582c280ad5324a2f0427999911c7e7abc14a6ab.zip |
Merge branch 'develop' into main
Diffstat (limited to 'bsie')
-rw-r--r-- | bsie/__init__.py | 18 | ||||
-rw-r--r-- | bsie/apps/__init__.py | 20 | ||||
-rw-r--r-- | bsie/apps/index.py | 121 | ||||
-rw-r--r-- | bsie/apps/info.py | 74 | ||||
-rw-r--r-- | bsie/base/__init__.py | 24 | ||||
-rw-r--r-- | bsie/base/errors.py | 42 | ||||
-rw-r--r-- | bsie/base/extractor.py | 103 | ||||
-rw-r--r-- | bsie/base/reader.py | 47 | ||||
-rw-r--r-- | bsie/extractor/__init__.py | 15 | ||||
-rw-r--r-- | bsie/extractor/generic/__init__.py | 16 | ||||
-rw-r--r-- | bsie/extractor/generic/constant.py | 57 | ||||
-rw-r--r-- | bsie/extractor/generic/path.py | 74 | ||||
-rw-r--r-- | bsie/extractor/generic/stat.py | 70 | ||||
-rw-r--r-- | bsie/lib/__init__.py | 18 | ||||
-rw-r--r-- | bsie/lib/bsie.py | 92 | ||||
-rw-r--r-- | bsie/reader/__init__.py | 19 | ||||
-rw-r--r-- | bsie/reader/path.py | 28 | ||||
-rw-r--r-- | bsie/reader/stat.py | 32 | ||||
-rw-r--r-- | bsie/tools/__init__.py | 20 | ||||
-rw-r--r-- | bsie/tools/builder.py | 226 | ||||
-rw-r--r-- | bsie/tools/pipeline.py | 144 | ||||
-rw-r--r-- | bsie/utils/__init__.py | 22 | ||||
-rw-r--r-- | bsie/utils/bsfs.py | 27 | ||||
-rw-r--r-- | bsie/utils/namespaces.py | 27 | ||||
-rw-r--r-- | bsie/utils/node.py | 53 |
25 files changed, 1389 insertions, 0 deletions
diff --git a/bsie/__init__.py b/bsie/__init__.py new file mode 100644 index 0000000..8d2308c --- /dev/null +++ b/bsie/__init__.py @@ -0,0 +1,18 @@ +"""The BSIE module extracts triples from files for insertion into a BSFS storage. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import collections +import typing + +# constants +T_VERSION_INFO = collections.namedtuple('T_VERSION_INFO', ('major', 'minor', 'micro')) # pylint: disable=invalid-name +version_info = T_VERSION_INFO(0, 0, 1) + +# exports +__all__: typing.Sequence[str] = [] + +## EOF ## diff --git a/bsie/apps/__init__.py b/bsie/apps/__init__.py new file mode 100644 index 0000000..a548c3c --- /dev/null +++ b/bsie/apps/__init__.py @@ -0,0 +1,20 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# inner-module imports +from .index import main as index +from .info import main as info + +# exports +__all__: typing.Sequence[str] = ( + 'index', + 'info', + ) + +## EOF ## diff --git a/bsie/apps/index.py b/bsie/apps/index.py new file mode 100644 index 0000000..1dbfdd8 --- /dev/null +++ b/bsie/apps/index.py @@ -0,0 +1,121 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. 
+Author: Matthias Baumgartner, 2022 +""" +# imports +import argparse +import os +import typing + +# bsie imports +from bsie.base import errors +from bsie.lib import BSIE +from bsie.tools import builder +from bsie.utils import bsfs + +# exports +__all__: typing.Sequence[str] = ( + 'main', + ) + + +## code ## + +def main(argv): + """Index files or directories into BSFS.""" + parser = argparse.ArgumentParser(description=main.__doc__, prog='index') + parser.add_argument('--user', type=bsfs.URI, default=bsfs.URI('http://example.com/me'), + help='') + parser.add_argument('--collect', action='append', default=[], + help='') + parser.add_argument('--discard', action='append', default=[], + help='') + parser.add_argument('-r', '--recursive', action='store_true', default=False, + help='') + parser.add_argument('--follow', action='store_true', default=False, + help='') + parser.add_argument('--print', action='store_true', default=False, + help='') + parser.add_argument('input_file', nargs=argparse.REMAINDER, + help='') + args = parser.parse_args(argv) + + # FIXME: Read reader/extractor configs from a config file + # reader builder + rbuild = builder.ReaderBuilder({}) + # extractor builder + ebuild = builder.ExtractorBuilder([ + {'bsie.extractor.generic.path.Path': {}}, + {'bsie.extractor.generic.stat.Stat': {}}, + {'bsie.extractor.generic.constant.Constant': dict( + tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')], + schema=''' + bse:author rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:string ; + bsfs:unique "true"^^xsd:boolean . 
+ ''', + )}, + ]) + # pipeline builder + pbuild = builder.PipelineBuilder( + bsfs.Namespace(args.user + ('/' if not args.user.endswith('/') else '')), + rbuild, + ebuild, + ) + + # build pipeline + pipeline = pbuild.build() + # build BSIE frontend + bsie = BSIE(pipeline, args.collect, args.discard) + + + def walk(handle): + """Walk through given input files.""" + # FIXME: collect all triples by node, set all predicates at once + # FIXME: simplify code (below but maybe also above) + # FIXME: How to handle dependencies between data? + # E.g. do I still want to link to a tag despite not being permitted to set its label? + # FIXME: node renaming? + + # index input paths + for path in args.input_file: + if os.path.isdir(path) and args.recursive: + for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=args.follow): + for filename in filenames: + for node, pred, value in bsie.from_file(os.path.join(dirpath, filename)): + handle(node, pred, value) + elif os.path.isfile(path): + for node, pred, value in bsie.from_file(path): + handle(node, pred, value) + else: + raise errors.UnreachableError() + + + if args.print: + walk(print) + return None + + # initialize bsfs + # NOTE: With persistent storages, the schema migration will be a separate operation. + # Here, we'd simply examine the schema and potentially discard more predicates. + store = bsfs.Open(bsfs.init_sparql_store(args.user)) + store.migrate(bsie.schema) + # process files + def handle(node, pred, value): + store.node(node.node_type, node.uri).set(pred.uri, value) + walk(handle) + # return store + return store + + + +## main ## + +if __name__ == '__main__': + import sys + main(sys.argv[1:]) + +## EOF ## diff --git a/bsie/apps/info.py b/bsie/apps/info.py new file mode 100644 index 0000000..eaf1f71 --- /dev/null +++ b/bsie/apps/info.py @@ -0,0 +1,74 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. 
+Author: Matthias Baumgartner, 2022 +""" +# imports +import argparse +import sys +import typing + +# bsie imports +from bsie.base import errors +from bsie.tools import builder +from bsie.utils import bsfs + +# exports +__all__: typing.Sequence[str] = ( + 'main', + ) + + +## code ## + +def main(argv): + """Show information from BSIE.""" + parser = argparse.ArgumentParser(description=main.__doc__, prog='info') + parser.add_argument('what', choices=('predicates', ), + help='Select what information to show.') + args = parser.parse_args(argv) + + # FIXME: Read reader/extractor configs from a config file + # reader builder + rbuild = builder.ReaderBuilder({}) + # extractor builder + ebuild = builder.ExtractorBuilder([ + {'bsie.extractor.generic.path.Path': {}}, + {'bsie.extractor.generic.stat.Stat': {}}, + {'bsie.extractor.generic.constant.Constant': dict( + tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')], + schema=''' + bse:author rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:string ; + bsfs:unique "true"^^xsd:boolean . + ''', + )}, + ]) + # pipeline builder + pbuild = builder.PipelineBuilder( + bsfs.Namespace('http://example.com/me/'), # not actually used + rbuild, + ebuild, + ) + + # build pipeline + pipeline = pbuild.build() + + # show info + if args.what == 'predicates': + # show predicates + for pred in pipeline.schema.predicates(): + print(pred.uri) + else: + # args.what is already checked by argparse + raise errors.UnreachableError() + + +## main ## + +if __name__ == '__main__': + main(sys.argv[1:]) + +## EOF ## diff --git a/bsie/base/__init__.py b/bsie/base/__init__.py new file mode 100644 index 0000000..0d362cd --- /dev/null +++ b/bsie/base/__init__.py @@ -0,0 +1,24 @@ +"""The base module defines the BSIE interfaces. + +You'll mostly find abstract classes here. + +Part of the bsie module. +A copy of the license is provided with the project. 
+Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# inner-module imports +from . import errors +from .extractor import Extractor +from .reader import Reader + +# exports +__all__: typing.Sequence[str] = ( + 'Extractor', + 'Reader', + 'errors', + ) + +## EOF ## diff --git a/bsie/base/errors.py b/bsie/base/errors.py new file mode 100644 index 0000000..dc3c30e --- /dev/null +++ b/bsie/base/errors.py @@ -0,0 +1,42 @@ +"""Common BSIE exceptions. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# exports +__all__: typing.Sequence[str] = ( + 'BuilderError', + 'ExtractorError', + 'LoaderError', + 'ReaderError', + ) + + +## code ## + +class _BSIEError(Exception): + """Generic BSIE error.""" + +class BuilderError(_BSIEError): + """The Builder failed to create an instance.""" + +class LoaderError(BuilderError): + """Failed to load a module or class.""" + +class ExtractorError(_BSIEError): + """The Extractor failed to process the given content.""" + +class ReaderError(_BSIEError): + """The Reader failed to read the given file.""" + +class ProgrammingError(_BSIEError): + """An assertion-like error that indicates a code-base issue.""" + +class UnreachableError(ProgrammingError): + """Bravo, you've reached a point in code that should logically not be reachable.""" + +## EOF ## diff --git a/bsie/base/extractor.py b/bsie/base/extractor.py new file mode 100644 index 0000000..c44021b --- /dev/null +++ b/bsie/base/extractor.py @@ -0,0 +1,103 @@ +"""The Extractor classes transform content into triples. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import abc +import typing + +# bsie imports +from bsie.utils import bsfs, node, ns + +# exports +__all__: typing.Sequence[str] = ( + 'Extractor', + ) + +# constants + +# essential definitions typically used in extractor schemas. 
+# NOTE: This preamble is only for convenience; Each Extractor must implement its use, if so desired. +SCHEMA_PREAMBLE = ''' + # common external prefixes + prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> + prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> + prefix xsd: <http://www.w3.org/2001/XMLSchema#> + prefix schema: <http://schema.org/> + + # common bsfs prefixes + prefix bsfs: <http://bsfs.ai/schema/> + prefix bse: <http://bsfs.ai/schema/Entity#> + + # essential nodes + bsfs:Entity rdfs:subClassOf bsfs:Node . + bsfs:File rdfs:subClassOf bsfs:Entity . + + # common definitions + xsd:string rdfs:subClassOf bsfs:Literal . + xsd:integer rdfs:subClassOf bsfs:Literal . + + ''' + + +## code ## + +class Extractor(abc.ABC): + """Produce (subject, predicate, value)-triples from some content. + The Extractor produces principal predicates that provide information + about the content itself (i.e., triples that include the subject), + and may also generate triples with auxiliary predicates if the + extracted value is a node itself. + """ + + # what type of content is expected (i.e. reader subclass). + CONTENT_READER: typing.Optional[str] = None + + # extractor schema. 
+ _schema: bsfs.schema.Schema + + def __init__(self, schema: bsfs.schema.Schema): + self._schema = schema + + def __str__(self) -> str: + return bsfs.typename(self) + + def __repr__(self) -> str: + return f'{bsfs.typename(self)}()' + + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) \ + and self.CONTENT_READER == other.CONTENT_READER \ + and self.schema == other.schema + + def __hash__(self) -> int: + return hash((type(self), self.CONTENT_READER, self.schema)) + + @property + def schema(self) -> bsfs.schema.Schema: + """Return the extractor's schema.""" + return self._schema + + @property + def principals(self) -> typing.Iterator[bsfs.schema.Predicate]: + """Return the principal predicates, i.e., relations from/to the extraction subject.""" + ent = self.schema.node(ns.bsfs.Entity) + return ( + pred + for pred + in self.schema.predicates() + if pred.domain <= ent or (pred.range is not None and pred.range <= ent) + ) + + @abc.abstractmethod + def extract( + self, + subject: node.Node, + content: typing.Any, + principals: typing.Iterable[bsfs.schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + """Return (node, predicate, value) triples.""" + +## EOF ## diff --git a/bsie/base/reader.py b/bsie/base/reader.py new file mode 100644 index 0000000..cbabd36 --- /dev/null +++ b/bsie/base/reader.py @@ -0,0 +1,47 @@ +"""The Reader classes return high-level content structures from files. + +The Reader fulfills two purposes: + First, it brokers between multiple libraries and file formats. + Second, it separates multiple aspects of a file into distinct content types. + +Part of the bsie module. +A copy of the license is provided with the project. 
+Author: Matthias Baumgartner, 2022 +""" +# imports +import abc +import typing + +# bsie imports +from bsie.utils import bsfs + +# exports +__all__: typing.Sequence[str] = ( + 'Reader', + ) + + +## code ## + +class Reader(abc.ABC): + """Read and return some content from a file.""" + + def __str__(self) -> str: + return bsfs.typename(self) + + def __repr__(self) -> str: + return f'{bsfs.typename(self)}()' + + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) + + def __hash__(self) -> int: + return hash(type(self)) + + @abc.abstractmethod + def __call__(self, path: bsfs.URI) -> typing.Any: + """Return some content of the file at *path*. + Raises a `ReaderError` if the reader cannot make sense of the file format. + """ + +## EOF ## diff --git a/bsie/extractor/__init__.py b/bsie/extractor/__init__.py new file mode 100644 index 0000000..ef31343 --- /dev/null +++ b/bsie/extractor/__init__.py @@ -0,0 +1,15 @@ +"""Extractors produce triples from some content. + +Each Extractor class is linked to the Reader class whose content it requires. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# exports +__all__: typing.Sequence[str] = [] + +## EOF ## diff --git a/bsie/extractor/generic/__init__.py b/bsie/extractor/generic/__init__.py new file mode 100644 index 0000000..0cb7e7f --- /dev/null +++ b/bsie/extractor/generic/__init__.py @@ -0,0 +1,16 @@ +"""Generic extractors focus on information that is typically available on all +files. Examples include file system information (file name and size, mime type, +etc.) and information that is independent of the actual file (constant triples, +host platform infos, current time, etc.). + +Part of the bsie module. +A copy of the license is provided with the project. 
+Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# exports +__all__: typing.Sequence[str] = [] + +## EOF ## diff --git a/bsie/extractor/generic/constant.py b/bsie/extractor/generic/constant.py new file mode 100644 index 0000000..11384e6 --- /dev/null +++ b/bsie/extractor/generic/constant.py @@ -0,0 +1,57 @@ +"""The Constant extractor produces pre-specified triples. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# bsie imports +from bsie.base import extractor +from bsie.utils import bsfs, node + +# exports +__all__: typing.Sequence[str] = ( + 'Constant', + ) + + +## code ## + +class Constant(extractor.Extractor): + """Produce pre-specified (predicate, value) pairs as triples.""" + + CONTENT_READER = None + + # predicate/value pairs to be produced. + _tuples: typing.Tuple[typing.Tuple[bsfs.schema.Predicate, typing.Any], ...] + + def __init__( + self, + schema: str, + tuples: typing.Iterable[typing.Tuple[bsfs.URI, typing.Any]], + ): + super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + schema)) + # NOTE: Raises a KeyError if the predicate is not part of the schema + self._tuples = tuple((self.schema.predicate(p_uri), value) for p_uri, value in tuples) + # TODO: use schema instance for value checking + + def __eq__(self, other: typing.Any) -> bool: + return super().__eq__(other) \ + and self._tuples == other._tuples + + def __hash__(self) -> int: + return hash((super().__hash__(), self._tuples)) + + def extract( + self, + subject: node.Node, + content: None, + principals: typing.Iterable[bsfs.schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + for pred, value in self._tuples: + if pred in principals: + yield subject, pred, value + +## EOF ## diff --git a/bsie/extractor/generic/path.py b/bsie/extractor/generic/path.py new file mode 100644 index 0000000..7018e12 --- /dev/null +++ 
b/bsie/extractor/generic/path.py @@ -0,0 +1,74 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import os +import typing + +# bsie imports +from bsie.base import extractor +from bsie.utils import bsfs, node, ns + +# exports +__all__: typing.Sequence[str] = ( + 'Path', + ) + + +## code ## + +class Path(extractor.Extractor): + """Extract information from file's path.""" + + CONTENT_READER = 'bsie.reader.path.Path' + + # mapping from predicate to handler function. + _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[str], typing.Any]] + + def __init__(self): + super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' + bse:filename rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:File ; + rdfs:range xsd:string ; + rdfs:label "File name"^^xsd:string ; + schema:description "Filename of entity in some filesystem."^^xsd:string ; + bsfs:unique "false"^^xsd:boolean . + ''')) + self._callmap = { + self.schema.predicate(ns.bse.filename): self.__filename, + } + + def extract( + self, + subject: node.Node, + content: str, + principals: typing.Iterable[bsfs.schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + for pred in principals: + # find callback + clbk = self._callmap.get(pred) + if clbk is None: + continue + # get value + value = clbk(content) + if value is None: + continue + # produce triple + yield subject, pred, value + + def __filename(self, path: str) -> typing.Optional[str]: + try: + return os.path.basename(path) + except Exception: # pylint: disable=broad-except # we explicitly want to catch everything + # some error, skip + # FIXME: some kind of error reporting (e.g. logging)? 
+ # Options: (a) Fail silently (current); (b) Skip and report to log; + # (c) Raise ExtractorError (aborts extraction); (d) separate content type + # checks from basename errors (report content type errors, skip basename + # errors) + return None + +## EOF ## diff --git a/bsie/extractor/generic/stat.py b/bsie/extractor/generic/stat.py new file mode 100644 index 0000000..0b9ce29 --- /dev/null +++ b/bsie/extractor/generic/stat.py @@ -0,0 +1,70 @@ +"""Extract information from the file system, such as filesize. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import os +import typing + +# bsie imports +from bsie.base import extractor +from bsie.utils import bsfs, node, ns + +# exports +__all__: typing.Sequence[str] = ( + 'Stat', + ) + + +## code ## + +class Stat(extractor.Extractor): + """Extract information from the file system.""" + + CONTENT_READER = 'bsie.reader.stat.Stat' + + # mapping from predicate to handler function. + _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[os.stat_result], typing.Any]] + + def __init__(self): + super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' + bse:filesize rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:File ; + rdfs:range xsd:integer ; + rdfs:label "File size"^^xsd:string ; + schema:description "File size of entity in some filesystem."^^xsd:string ; + bsfs:unique "false"^^xsd:boolean . 
+ ''')) + self._callmap = { + self.schema.predicate(ns.bse.filesize): self.__filesize, + } + + def extract( + self, + subject: node.Node, + content: os.stat_result, + principals: typing.Iterable[bsfs.schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + for pred in principals: + # find callback + clbk = self._callmap.get(pred) + if clbk is None: + continue + # get value + value = clbk(content) + if value is None: + continue + # produce triple + yield subject, pred, value + + def __filesize(self, content: os.stat_result) -> typing.Optional[int]: + """Return the file size.""" + try: + return content.st_size + except Exception: # pylint: disable=broad-except # we explicitly want to catch everything + # FIXME: some kind of error reporting (e.g. logging) + return None + +## EOF ## diff --git a/bsie/lib/__init__.py b/bsie/lib/__init__.py new file mode 100644 index 0000000..578c2c4 --- /dev/null +++ b/bsie/lib/__init__.py @@ -0,0 +1,18 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# inner-module imports +from .bsie import BSIE + +# exports +__all__: typing.Sequence[str] = ( + 'BSIE', + ) + +## EOF ## diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py new file mode 100644 index 0000000..e087fa9 --- /dev/null +++ b/bsie/lib/bsie.py @@ -0,0 +1,92 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# bsie imports +from bsie.tools import Pipeline +from bsie.utils import bsfs, node, ns + +# exports +__all__: typing.Sequence[str] = ( + 'BSIE', + ) + + +## code ## + +class BSIE(): + """Extract triples from files. + + Controls which predicates to extract (*collect*) and + which to not extract (*discard*). Note that this only affects + principal predicates, not auxiliary predicates like, e.g., tag labels. 
+ + """ + + # pipeline + _pipeline: Pipeline + + # predicates to extract. + _principals: typing.Set[bsfs.URI] + + # local schema. + _schema: bsfs.schema.Schema + + def __init__( + self, + # pipeline builder. + pipeline: Pipeline, + # principals to extract at most. None implies all available w.r.t. extractors. + collect: typing.Optional[typing.Iterable[bsfs.URI]] = None, + # principals to discard. + discard: typing.Optional[typing.Iterable[bsfs.URI]] = None, + ): + # store pipeline + self._pipeline = pipeline + # start off with available principals + self._principals = {pred.uri for pred in self._pipeline.principals} + # limit principals to specified ones by argument. + if collect is not None: + collect = set(collect) + if len(collect) > 0: + self._principals &= collect + # discard principals. + if discard is not None: + self._principals -= set(discard) + # discard ns.bsfs.Predicate + self._principals.discard(ns.bsfs.Predicate) + # compile a schema that only contains the requested principals (and auxiliary predicates) + self._schema = self._pipeline.subschema( + self._pipeline.schema.predicate(pred) for pred in self._principals) + + @property + def schema(self) -> bsfs.schema.Schema: + """Return the BSIE schema.""" + return self._schema + + @property + def principals(self) -> typing.Iterator[bsfs.URI]: + """Return an iterator to the principal predicates.""" + return iter(self._principals) + + def from_file( + self, + path: bsfs.URI, + principals: typing.Optional[typing.Iterable[bsfs.URI]] = None, + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.URI, typing.Any]]: + """Produce triples for a given *path*. Limit to *principals* if given.""" + # get requested principals. + principals = set(principals) if principals is not None else self._principals + # filter through requested principals. 
+ principals &= self._principals + # predicate lookup + principals = {self.schema.predicate(pred) for pred in principals} + # invoke pipeline + yield from self._pipeline(path, principals) + +## EOF ## diff --git a/bsie/reader/__init__.py b/bsie/reader/__init__.py new file mode 100644 index 0000000..a45f22b --- /dev/null +++ b/bsie/reader/__init__.py @@ -0,0 +1,19 @@ +"""The Reader classes return high-level content structures from files. + +The Reader fulfills two purposes: + First, it brokers between multiple libraries and file formats. + Second, it separates multiple aspects of a file into distinct content types. + +Often, different libraries focus on reading different types of content from a +file. E.g. one would use different modules to read file system infos than to +read exif or pixel data of an image. Hence, this module is organized by content +type. Each distinct type can be implemented in a file or submodule that +provides a Reader implementation. Through utilization of submodules, different +file formats can be supported. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" + +## EOF ## diff --git a/bsie/reader/path.py b/bsie/reader/path.py new file mode 100644 index 0000000..d60f187 --- /dev/null +++ b/bsie/reader/path.py @@ -0,0 +1,28 @@ +"""The Path reader produces a file path. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# bsie imports +from bsie.base import reader + +# exports +__all__: typing.Sequence[str] = ( + 'Path', + ) + + +## code ## + +class Path(reader.Reader): + """Return the path.""" + + def __call__(self, path: str) -> str: + return path + + +## EOF ## diff --git a/bsie/reader/stat.py b/bsie/reader/stat.py new file mode 100644 index 0000000..fc5fb24 --- /dev/null +++ b/bsie/reader/stat.py @@ -0,0 +1,32 @@ +"""The Stat reader produces filesystem stat information. 
+ +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import os +import typing + +# bsie imports +from bsie.base import errors, reader + +# exports +__all__: typing.Sequence[str] = ( + 'Stat', + ) + + +## code ## + +class Stat(reader.Reader): + """Read and return the filesystem's stat infos.""" + + def __call__(self, path: str) -> os.stat_result: + try: + return os.stat(path) + except Exception as err: + raise errors.ReaderError(path) from err + + +## EOF ## diff --git a/bsie/tools/__init__.py b/bsie/tools/__init__.py new file mode 100644 index 0000000..803c321 --- /dev/null +++ b/bsie/tools/__init__.py @@ -0,0 +1,20 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# inner-module imports +from . import builder +from .pipeline import Pipeline + +# exports +__all__: typing.Sequence[str] = ( + 'builder', + 'Pipeline', + ) + +## EOF ## diff --git a/bsie/tools/builder.py b/bsie/tools/builder.py new file mode 100644 index 0000000..190d9bf --- /dev/null +++ b/bsie/tools/builder.py @@ -0,0 +1,226 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import importlib +import logging +import typing + +# bsie imports +from bsie import base +from bsie.base import errors +from bsie.utils import bsfs + +# inner-module imports +from . import pipeline + +# exports +__all__: typing.Sequence[str] = ( + 'ExtractorBuilder', + 'PipelineBuilder', + 'ReaderBuilder', + ) + + +## code ## + +logger = logging.getLogger(__name__) + +def _safe_load(module_name: str, class_name: str): + """Get a class from a module. 
Raise BuilderError if anything goes wrong.""" + try: + # load the module + module = importlib.import_module(module_name) + except Exception as err: + # cannot import module + raise errors.LoaderError(f'cannot load module {module_name}') from err + + try: + # get the class from the module + cls = getattr(module, class_name) + except Exception as err: + # cannot find the class + raise errors.LoaderError(f'cannot load class {class_name} from module {module_name}') from err + + return cls + + +def _unpack_name(name): + """Split a name into its module and class component (dot-separated).""" + if not isinstance(name, str): + raise TypeError(name) + if '.' not in name: + raise ValueError('name must be a qualified class name.') + module_name, class_name = name[:name.rfind('.')], name[name.rfind('.')+1:] + if module_name == '': + raise ValueError('name must be a qualified class name.') + return module_name, class_name + + +class ReaderBuilder(): + """Build `bsie.base.Reader` instances. + + Readers are defined via their qualified class name + (e.g., bsie.reader.path.Path) and optional keyword + arguments that are passed to the constructor via + the *kwargs* argument (name as key, kwargs as value). + The ReaderBuilder keeps a cache of previously built + reader instances, as they are anyway built with + identical keyword arguments. 
+ + """ + + # keyword arguments + _kwargs: typing.Dict[str, typing.Dict[str, typing.Any]] + + # cached readers + _cache: typing.Dict[str, base.Reader] + + def __init__(self, kwargs: typing.Dict[str, typing.Dict[str, typing.Any]]): + self._kwargs = kwargs + self._cache = {} + + def build(self, name: str) -> base.Reader: + """Return an instance for the qualified class name.""" + # return cached instance + if name in self._cache: + return self._cache[name] + + # check name and get module/class components + module_name, class_name = _unpack_name(name) + + # import reader class + cls = _safe_load(module_name, class_name) + + # get kwargs + kwargs = self._kwargs.get(name, {}) + if not isinstance(kwargs, dict): + raise TypeError(f'expected a kwargs dict, found {bsfs.typename(kwargs)}') + + try: # build, cache, and return instance + obj = cls(**kwargs) + # cache instance + self._cache[name] = obj + # return instance + return obj + + except Exception as err: + raise errors.BuilderError(f'failed to build reader {name} due to {bsfs.typename(err)}: {err}') from err + + +class ExtractorBuilder(): + """Build `bsie.base.Extractor` instances. + + It is permissible to build multiple instances of the same extractor + (typically with different arguments), hence the ExtractorBuilder + receives a list of build specifications. Each specification is + a dict with a single key (extractor's qualified name) and a dict + to be used as keyword arguments. 
+ Example: [{'bsie.extractor.generic.path.Path': {}}, ] + + """ + + # build specifications + _specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]] + + def __init__(self, specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]): + self._specs = specs + + def __iter__(self) -> typing.Iterator[int]: + """Iterate over extractor specifications.""" + return iter(range(len(self._specs))) + + def build(self, index: int) -> base.Extractor: + """Return an instance of the n'th extractor (n=*index*).""" + # get build instructions + specs = self._specs[index] + + # check specs structure. expecting[{name: {kwargs}}] + if not isinstance(specs, dict): + raise TypeError(f'expected a dict, found {bsfs.typename(specs)}') + if len(specs) != 1: + raise TypeError(f'expected a dict of length one, found {len(specs)}') + + # get name and args from specs + name = next(iter(specs.keys())) + kwargs = specs[name] + + # check kwargs structure + if not isinstance(kwargs, dict): + raise TypeError(f'expected a dict, found {bsfs.typename(kwargs)}') + + # check name and get module/class components + module_name, class_name = _unpack_name(name) + + # import extractor class + cls = _safe_load(module_name, class_name) + + try: # build and return instance + return cls(**kwargs) + + except Exception as err: + raise errors.BuilderError(f'failed to build extractor {name} due to {bsfs.typename(err)}: {err}') from err + + +class PipelineBuilder(): + """Build `bsie.tools.pipeline.Pipeline` instances.""" + + # Prefix to be used in the Pipeline. + prefix: bsfs.Namespace + + # builder for Readers. + rbuild: ReaderBuilder + + # builder for Extractors. 
class Pipeline():
    """Extraction pipeline to generate triples from files.

    The Pipeline binds readers and extractors, and performs
    the necessary operations to produce triples from a file.
    It takes a best-effort approach to extract as many triples
    as possible. Errors during the extraction are passed over
    and reported to the log.

    """

    # combined extractor schemas.
    _schema: bsfs.schema.Schema

    # node prefix.
    _prefix: bsfs.Namespace

    # extractor -> reader mapping. Extractors mapped to None are called with None as content.
    _ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]]

    def __init__(
            self,
            prefix: bsfs.Namespace,
            ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]]
            ):
        """Create a pipeline that names file nodes under *prefix* and runs the extractors in *ext2rdr*."""
        # store core members
        self._prefix = prefix + FILE_PREFIX
        self._ext2rdr = ext2rdr
        # compile schema from all extractors
        self._schema = bsfs.schema.Schema.Union(ext.schema for ext in ext2rdr)

    def __str__(self) -> str:
        return bsfs.typename(self)

    def __repr__(self) -> str:
        return f'{bsfs.typename(self)}(...)'

    def __hash__(self) -> int:
        # Hash the extractor/reader pairs as an unordered set: __eq__ compares the
        # mapping with dict equality, which ignores insertion order. Hashing the
        # keys and values as ordered tuples would give equal pipelines different
        # hashes whenever their extractors were registered in a different order.
        return hash((type(self), self._prefix, self._schema, frozenset(self._ext2rdr.items())))

    def __eq__(self, other: typing.Any) -> bool:
        return isinstance(other, type(self)) \
           and self._schema == other._schema \
           and self._prefix == other._prefix \
           and self._ext2rdr == other._ext2rdr

    @property
    def schema(self) -> bsfs.schema.Schema:
        """Return the pipeline's schema (combined from all extractors)."""
        return self._schema

    @property
    def principals(self) -> typing.Iterator[bsfs.schema.Predicate]:
        """Return the principal predicates that can be extracted."""
        return iter({pred for ext in self._ext2rdr for pred in ext.principals})

    def subschema(self, principals: typing.Iterable[bsfs.schema.Predicate]) -> bsfs.schema.Schema:
        """Return the subset of the schema that supports the given *principals*."""
        # materialize principals, the iterable is consulted once per extractor
        principals = set(principals)
        # collect and combine schemas from extractors that share a principal
        return bsfs.schema.Schema.Union({
            ext.schema
            for ext
            in self._ext2rdr
            if not set(ext.principals).isdisjoint(principals)
            })

    def __call__(
            self,
            path: bsfs.URI,
            principals: typing.Optional[typing.Iterable[bsfs.schema.Predicate]] = None,
            ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
        """Extract triples from the file at *path*. Optionally, limit triples to *principals*.

        Yields (node, predicate, value) triples as produced by the extractors.
        If *principals* is None, all predicates of the pipeline's schema are
        requested. Only extractors that share a principal with the request
        are executed. A ReaderError skips all extractors of that reader, an
        ExtractorError skips the remainder of that extractor; both are logged.
        """
        # get principals
        principals = set(principals) if principals is not None else set(self.schema.predicates())

        # get extractors
        extractors = {ext for ext in self._ext2rdr if not set(ext.principals).isdisjoint(principals)}

        # corner-case short-cut
        if not extractors:
            return

        # get readers -> extractors mapping, so that each reader runs once per file
        rdr2ext = defaultdict(set)
        for ext in extractors:
            rdr = self._ext2rdr[ext]
            rdr2ext[rdr].add(ext)

        # create subject for file
        uuid = bsfs.uuid.UCID.from_path(path)
        subject = node.Node(ns.bsfs.File, self._prefix[uuid])

        # extract information
        for rdr, extrs in rdr2ext.items():
            try:
                # get content
                content = rdr(path) if rdr is not None else None

                # apply extractors on this content
                for ext in extrs:
                    try:
                        # Forward the triples without rebinding *subject*: every
                        # extractor must receive the file's node, even if a previous
                        # extractor emitted triples about a different node.
                        yield from ext.extract(subject, content, principals)

                    except base.errors.ExtractorError as err:
                        # critical extractor failure.
                        logger.error('%s failed to extract triples from content: %s', ext, err)

            except base.errors.ReaderError as err:
                # failed to read any content. skip.
                logger.error('%s failed to read content: %s', rdr, err)
import node + +# exports +__all__: typing.Sequence[str] = ( + 'bsfs', + 'node', + 'ns', + ) + +## EOF ## diff --git a/bsie/utils/bsfs.py b/bsie/utils/bsfs.py new file mode 100644 index 0000000..0b88479 --- /dev/null +++ b/bsie/utils/bsfs.py @@ -0,0 +1,27 @@ +"""BSFS bridge, provides BSFS bindings for BSIE. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# bsfs imports +from bsfs import Open, schema +from bsfs.apps.init import init_sparql_store +from bsfs.namespace import Namespace +from bsfs.utils import URI, typename, uuid + +# exports +__all__: typing.Sequence[str] = ( + 'Namespace', + 'Open', + 'URI', + 'init_sparql_store', + 'schema', + 'typename', + 'uuid', + ) + +## EOF ## diff --git a/bsie/utils/namespaces.py b/bsie/utils/namespaces.py new file mode 100644 index 0000000..a29fc1b --- /dev/null +++ b/bsie/utils/namespaces.py @@ -0,0 +1,27 @@ +"""Default namespaces used throughout BSIE. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# inner-module imports +from . import bsfs as _bsfs + +# constants +bse = _bsfs.Namespace('http://bsfs.ai/schema/Entity') +bsfs = _bsfs.Namespace('http://bsfs.ai/schema', fsep='/') +bsm = _bsfs.Namespace('http://bsfs.ai/schema/Meta') +xsd = _bsfs.Namespace('http://www.w3.org/2001/XMLSchema') + +# export +__all__: typing.Sequence[str] = ( + 'bse', + 'bsfs', + 'bsm', + 'xsd', + ) + +## EOF ## diff --git a/bsie/utils/node.py b/bsie/utils/node.py new file mode 100644 index 0000000..ecf39cd --- /dev/null +++ b/bsie/utils/node.py @@ -0,0 +1,53 @@ +"""Lighweight Node to bridge to BSFS. + +Part of the bsie module. +A copy of the license is provided with the project. 
class Node():
    """Lightweight Node, disconnected from any bsfs structures.

    A Node consists of a node type and a node URI only. Two Nodes
    are equal if both of these members are equal.
    """

    # node type.
    node_type: bsfs.URI

    # node URI.
    uri: bsfs.URI

    def __init__(
            self,
            node_type: bsfs.URI,
            uri: bsfs.URI,
            ):
        # pass both arguments through the URI constructor before storing them
        self.node_type = bsfs.URI(node_type)
        self.uri = bsfs.URI(uri)

    def __eq__(self, other: typing.Any) -> bool:
        # anything that is not a Node cannot be equal
        if not isinstance(other, Node):
            return False
        same_type = other.node_type == self.node_type
        return same_type and other.uri == self.uri

    def __hash__(self) -> int:
        members = (type(self), self.node_type, self.uri)
        return hash(members)

    def __str__(self) -> str:
        name = bsfs.typename(self)
        return f'{name}({self.node_type}, {self.uri})'

    def __repr__(self) -> str:
        name = bsfs.typename(self)
        return f'{name}({self.node_type}, {self.uri})'