diff options
Diffstat (limited to 'bsie/apps/index.py')
-rw-r--r-- | bsie/apps/index.py | 131 |
1 files changed, 131 insertions, 0 deletions
"""
Command-line app that indexes files or directories into BSFS.

Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
# imports
import argparse
import os
import typing

# bsfs imports
import bsfs

# bsie imports
from bsie.base import errors
from bsie.lib.bsie import BSIE
from bsie.tools import builder
from bsie.utils.bsfs import URI

# exports
__all__: typing.Sequence[str] = (
    'main',
    )


## code ##

def _iter_input_files(
        paths: typing.Iterable[str],
        recursive: bool,
        follow: bool,
        ) -> typing.Iterator[str]:
    """Yield the path of every file selected by *paths*.

    Regular files are yielded as given. Directories are expanded with
    os.walk (following symlinks if *follow* is set), but only if *recursive*
    is set. Paths are checked lazily, i.e. when the iteration reaches them.

    Raises:
        IsADirectoryError: a path is a directory but *recursive* is not set.
        FileNotFoundError: a path does not exist.
        OSError: a path exists but is neither a regular file nor a directory.
    """
    for path in paths:
        if os.path.isfile(path):
            yield path
        elif os.path.isdir(path):
            if not recursive:
                raise IsADirectoryError(
                    f'{path!r} is a directory; pass -r/--recursive to index it')
            for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=follow):
                for filename in filenames:
                    yield os.path.join(dirpath, filename)
        elif not os.path.exists(path):
            raise FileNotFoundError(f'no such file or directory: {path!r}')
        else:
            raise OSError(f'not a regular file or directory: {path!r}')


def main(argv: typing.Sequence[str]):
    """Index files or directories into BSFS."""
    # NOTE: The docstring doubles as the argparse description, hence it must
    # remain a one-liner. Further documentation:
    #
    # argv: command line arguments, without the program name.
    # Returns None if --print was given (triples are printed to stdout),
    # otherwise the bsfs store that was populated with the extracted triples.
    # Raises IsADirectoryError, FileNotFoundError, or OSError if an input
    # path cannot be indexed (see _iter_input_files).
    parser = argparse.ArgumentParser(description=main.__doc__, prog='index')
    # the user URI determines the node prefix and the user of the bsfs graph
    parser.add_argument('--user', type=URI, default=URI('http://example.com/me'),
        help='')
    # --collect and --discard are passed to the BSIE frontend as-is
    parser.add_argument('--collect', action='append', default=[],
        help='')
    parser.add_argument('--discard', action='append', default=[],
        help='')
    # directories are only indexed if --recursive is set
    parser.add_argument('-r', '--recursive', action='store_true', default=False,
        help='')
    # forwarded to os.walk as followlinks
    parser.add_argument('--follow', action='store_true', default=False,
        help='')
    # print triples instead of writing them to a store
    parser.add_argument('--print', action='store_true', default=False,
        help='')
    # all remaining arguments are gathered as input paths
    parser.add_argument('input_file', nargs=argparse.REMAINDER,
        help='')
    args = parser.parse_args(argv)

    # FIXME: Read reader/extractor configs from a config file
    # reader builder
    rbuild = builder.ReaderBuilder({})
    # extractor builder
    ebuild = builder.ExtractorBuilder([
        {'bsie.extractor.generic.path.Path': {}},
        {'bsie.extractor.generic.stat.Stat': {}},
        {'bsie.extractor.generic.constant.Constant': dict(
            tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')],
            schema='''
                bse:author rdfs:subClassOf bsfs:Predicate ;
                    rdfs:domain bsfs:Entity ;
                    rdfs:range xsd:string ;
                    bsfs:unique "true"^^xsd:boolean .
                ''',
        )},
        ])
    # pipeline builder
    # the prefix is '<user>/file#'; avoid a double slash if the user URI ends with one
    prefix = URI(args.user + ('file#' if args.user.endswith('/') else '/file#'))
    pbuild = builder.PipelineBuilder(
        prefix,
        rbuild,
        ebuild,
        )

    # build pipeline
    pipeline = pbuild.build()
    # build BSIE frontend
    bsie = BSIE(pipeline, args.collect, args.discard)


    def walk(handle):
        """Walk through given input files and pass each triple to *handle*."""
        # FIXME: collect all triples by node, set all predicates at once
        # FIXME: simplify code (below but maybe also above)
        # FIXME: How to handle dependencies between data?
        #        E.g. do I still want to link to a tag despite not being permitted to set its label?
        # FIXME: node renaming?

        # index input paths
        for path in _iter_input_files(args.input_file, args.recursive, args.follow):
            for node, pred, value in bsie.from_file(path):
                handle(node, pred, value)


    if args.print:
        # only show the triples, no store is created
        walk(print)
        return None

    # initialize bsfs
    # NOTE: With persistent storages, the schema migration will be a separate operation.
    # Here, we'd simply examine the schema and potentially discard more predicates.
    store = bsfs.Open({
        'Graph': {
            'user': args.user,
            'backend': {
                'SparqlStore': {}},
            }})
    store.migrate(bsie.schema)

    # process files
    def handle(node, pred, value):
        """Write one triple to the store."""
        store.node(node.node_type, node.uri).set(pred.uri, value)

    walk(handle)
    # return store
    return store



## main ##

if __name__ == '__main__':
    import sys
    main(sys.argv[1:])

## EOF ##