diff options
Diffstat (limited to 'bsie')
-rw-r--r-- | bsie/__init__.py | 6 | ||||
-rw-r--r-- | bsie/apps/__init__.py | 20 | ||||
-rw-r--r-- | bsie/apps/index.py | 131 | ||||
-rw-r--r-- | bsie/apps/info.py | 74 | ||||
-rw-r--r-- | bsie/base/errors.py | 6 | ||||
-rw-r--r-- | bsie/lib/__init__.py | 13 | ||||
-rw-r--r-- | bsie/lib/bsie.py | 80 | ||||
-rw-r--r-- | bsie/tools/pipeline.py | 4 | ||||
-rw-r--r-- | bsie/utils/namespaces.py | 2 |
9 files changed, 335 insertions, 1 deletions
diff --git a/bsie/__init__.py b/bsie/__init__.py index 2f2477a..2b874bd 100644 --- a/bsie/__init__.py +++ b/bsie/__init__.py @@ -5,8 +5,14 @@ A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ # imports +import collections import typing +# constants +version_info = collections.namedtuple('version_info', + ('major', 'minor', 'micro')) \ + (0, 0, 1) + # exports __all__: typing.Sequence[str] = [] diff --git a/bsie/apps/__init__.py b/bsie/apps/__init__.py new file mode 100644 index 0000000..a548c3c --- /dev/null +++ b/bsie/apps/__init__.py @@ -0,0 +1,20 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# inner-module imports +from .index import main as index +from .info import main as info + +# exports +__all__: typing.Sequence[str] = ( + 'index', + 'info', + ) + +## EOF ## diff --git a/bsie/apps/index.py b/bsie/apps/index.py new file mode 100644 index 0000000..821aa4c --- /dev/null +++ b/bsie/apps/index.py @@ -0,0 +1,131 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. 
+Author: Matthias Baumgartner, 2022 +""" +# imports +import argparse +import os +import typing + +# bsfs imports +import bsfs + +# bsie imports +from bsie.base import errors +from bsie.lib.bsie import BSIE +from bsie.tools import builder +from bsie.utils.bsfs import URI + +# exports +__all__: typing.Sequence[str] = ( + 'main', + ) + + +## code ## + +def main(argv): + """Index files or directories into BSFS.""" + parser = argparse.ArgumentParser(description=main.__doc__, prog='index') + parser.add_argument('--user', type=URI, default=URI('http://example.com/me'), + help='') + parser.add_argument('--collect', action='append', default=[], + help='') + parser.add_argument('--discard', action='append', default=[], + help='') + parser.add_argument('-r', '--recursive', action='store_true', default=False, + help='') + parser.add_argument('--follow', action='store_true', default=False, + help='') + parser.add_argument('--print', action='store_true', default=False, + help='') + parser.add_argument('input_file', nargs=argparse.REMAINDER, + help='') + args = parser.parse_args(argv) + + # FIXME: Read reader/extractor configs from a config file + # reader builder + rbuild = builder.ReaderBuilder({}) + # extractor builder + ebuild = builder.ExtractorBuilder([ + {'bsie.extractor.generic.path.Path': {}}, + {'bsie.extractor.generic.stat.Stat': {}}, + {'bsie.extractor.generic.constant.Constant': dict( + tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')], + schema=''' + bse:author rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:string ; + bsfs:unique "true"^^xsd:boolean . 
+ ''', + )}, + ]) + # pipeline builder + prefix = URI(args.user + ('file#' if args.user.endswith('/') else '/file#')) + pbuild = builder.PipelineBuilder( + prefix, + rbuild, + ebuild, + ) + + # build pipeline + pipeline = pbuild.build() + # build BSIE frontend + bsie = BSIE(pipeline, args.collect, args.discard) + + + def walk(handle): + """Walk through given input files.""" + # FIXME: collect all triples by node, set all predicates at once + # FIXME: simplify code (below but maybe also above) + # FIXME: How to handle dependencies between data? + # E.g. do I still want to link to a tag despite not being permitted to set its label? + # FIXME: node renaming? + + # index input paths + for path in args.input_file: + if os.path.isdir(path) and args.recursive: + for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=args.follow): + for filename in filenames: + for node, pred, value in bsie.from_file(os.path.join(dirpath, filename)): + handle(node, pred, value) + elif os.path.isfile(path): + for node, pred, value in bsie.from_file(path): + handle(node, pred, value) + else: + raise errors.UnreachableError() + + + if args.print: + walk(print) + return None + + else: + # initialize bsfs + # NOTE: With persistent storages, the schema migration will be a separate operation. + # Here, we'd simply examine the schema and potentially discard more predicates. + store = bsfs.Open({ + 'Graph': { + 'user': args.user, + 'backend': { + 'SparqlStore': {}}, + }}) + store.migrate(bsie.schema) + # process files + def handle(node, pred, value): + store.node(node.node_type, node.uri).set(pred.uri, value) + walk(handle) + # return store + return store + + + +## main ## + +if __name__ == '__main__': + import sys + main(sys.argv[1:]) + +## EOF ## diff --git a/bsie/apps/info.py b/bsie/apps/info.py new file mode 100644 index 0000000..8cc6dca --- /dev/null +++ b/bsie/apps/info.py @@ -0,0 +1,74 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. 
+Author: Matthias Baumgartner, 2022 +""" +# imports +import argparse +import sys +import typing + +# bsie imports +from bsie.base import errors +from bsie.tools import builder +from bsie.utils.bsfs import URI + +# exports +__all__: typing.Sequence[str] = ( + 'main', + ) + + +## code ## + +def main(argv): + """Show information from BSIE.""" + parser = argparse.ArgumentParser(description=main.__doc__, prog='info') + parser.add_argument('what', choices=('predicates', ), + help='Select what information to show.') + args = parser.parse_args(argv) + + # FIXME: Read reader/extractor configs from a config file + # reader builder + rbuild = builder.ReaderBuilder({}) + # extractor builder + ebuild = builder.ExtractorBuilder([ + {'bsie.extractor.generic.path.Path': {}}, + {'bsie.extractor.generic.stat.Stat': {}}, + {'bsie.extractor.generic.constant.Constant': dict( + tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')], + schema=''' + bse:author rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:Entity ; + rdfs:range xsd:string ; + bsfs:unique "true"^^xsd:boolean . 
+ ''', + )}, + ]) + # pipeline builder + pbuild = builder.PipelineBuilder( + URI('http://example.com/me/file#'), # not actually used + rbuild, + ebuild, + ) + + # build pipeline + pipeline = pbuild.build() + + # show info + if args.what == 'predicates': + # show predicates + for pred in pipeline.schema.predicates(): + print(pred.uri) + else: + # args.what is already checked by argparse + raise errors.UnreachableError() + + +## main ## + +if __name__ == '__main__': + main(sys.argv[1:]) + +## EOF ## diff --git a/bsie/base/errors.py b/bsie/base/errors.py index 760351f..dc3c30e 100644 --- a/bsie/base/errors.py +++ b/bsie/base/errors.py @@ -33,4 +33,10 @@ class ExtractorError(_BSIEError): class ReaderError(_BSIEError): """The Reader failed to read the given file.""" +class ProgrammingError(_BSIEError): + """An assertion-like error that indicates a code-base issue.""" + +class UnreachableError(ProgrammingError): + """Bravo, you've reached a point in code that should logically not be reachable.""" + ## EOF ## diff --git a/bsie/lib/__init__.py b/bsie/lib/__init__.py new file mode 100644 index 0000000..f6c9018 --- /dev/null +++ b/bsie/lib/__init__.py @@ -0,0 +1,13 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# exports +__all__: typing.Sequence[str] = [] + +## EOF ## diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py new file mode 100644 index 0000000..aeccc8c --- /dev/null +++ b/bsie/lib/bsie.py @@ -0,0 +1,80 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# bsie imports +from bsie.tools.pipeline import Pipeline +from bsie.utils import node, ns +from bsie.utils.bsfs import URI, schema as schema_ + +# exports +__all__: typing.Sequence[str] = ( + 'BSIE', + ) + + +## code ## + +class BSIE(): + """Extract triples from files. 
+ + Controls which predicates to extract (*collect*) and + which to not extract (*discard*). Note that this only affects + principal predicates not auxiliary predicates like, e.g., tag labels. + + """ + + # predicates to extract. + predicates: typing.Set[URI] + + # local schema. + schema: schema_.Schema + + def __init__( + self, + # pipeline builder. + pipeline: Pipeline, + # predicates to extract at most. None implies all available w.r.t. extractors. + collect: typing.Optional[typing.Iterable[URI]] = None, + # predicates to discard. + discard: typing.Optional[typing.Iterable[URI]] = None, + ): + # store pipeline + self.pipeline = pipeline + # start off with available predicates + self.predicates = {pred.uri for pred in self.pipeline.predicates()} + # limit predicates to specified ones by argument. + if collect is not None: + collect = set(collect) + if len(collect) > 0: + self.predicates &= collect + # discard predicates. + if discard is not None: + self.predicates -= set(discard) + # discard ns.bsfs.Predicate + self.predicates.discard(ns.bsfs.Predicate) + # compile a schema that only contains the requested predicates (and implied types) + self.schema = schema_.Schema({ + self.pipeline.schema.predicate(pred) for pred in self.predicates}) + + def from_file( + self, + path: URI, + predicates: typing.Optional[typing.Iterable[URI]] = None, + ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]: + """Produce triples for a given *path*. Limit to *predicates* if given.""" + # get requested predicates. + predicates = set(predicates) if predicates is not None else self.predicates + # filter through requested predicates. 
+ predicates &= self.predicates + # predicate lookup + predicates = {self.schema.predicate(pred) for pred in predicates} + # invoke pipeline + yield from self.pipeline(path, predicates) + +## EOF ## diff --git a/bsie/tools/pipeline.py b/bsie/tools/pipeline.py index 8e1c992..da422c0 100644 --- a/bsie/tools/pipeline.py +++ b/bsie/tools/pipeline.py @@ -70,6 +70,10 @@ class Pipeline(): and self._prefix == other._prefix \ and self._ext2rdr == other._ext2rdr + def predicates(self) -> typing.Iterator[_schema.Predicate]: + """Return the predicates that are extracted from a file.""" + return iter({pred for ext in self._ext2rdr for pred in ext.predicates()}) + def __call__( self, path: URI, diff --git a/bsie/utils/namespaces.py b/bsie/utils/namespaces.py index 13be96b..2fcb2dc 100644 --- a/bsie/utils/namespaces.py +++ b/bsie/utils/namespaces.py @@ -13,7 +13,7 @@ from . import bsfs as _bsfs # constants bse = _bsfs.Namespace('http://bsfs.ai/schema/Entity#') bsfs = _bsfs.Namespace('http://bsfs.ai/schema/') -bsm = _bsfs.Namespace('http://bsfs.ai/schema/meta#') +bsm = _bsfs.Namespace('http://bsfs.ai/schema/Meta#') xsd = _bsfs.Namespace('http://www.w3.org/2001/XMLSchema#') # export |