""" Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ # standard imports import argparse import os import typing # bsie imports from bsie.extractor import ExtractorBuilder from bsie.lib import BSIE, PipelineBuilder, DefaultNamingPolicy from bsie.reader import ReaderBuilder from bsie.utils import bsfs, errors, node as node_ # exports __all__: typing.Sequence[str] = ( 'main', ) ## code ## def main(argv): """Index files or directories into BSFS.""" parser = argparse.ArgumentParser(description=main.__doc__, prog='index') parser.add_argument('--host', type=bsfs.URI, default=bsfs.URI('http://example.com'), help='') parser.add_argument('--user', type=str, default='me', help='') parser.add_argument('--collect', action='append', default=[], help='') parser.add_argument('--discard', action='append', default=[], help='') parser.add_argument('-r', '--recursive', action='store_true', default=False, help='') parser.add_argument('--follow', action='store_true', default=False, help='') parser.add_argument('--print', action='store_true', default=False, help='') parser.add_argument('input_file', nargs=argparse.REMAINDER, help='') args = parser.parse_args(argv) # FIXME: Read reader/extractor configs from a config file # reader builder rbuild = ReaderBuilder() # extractor builder ebuild = ExtractorBuilder([ {'bsie.extractor.preview.Preview': { 'max_sides': [50], }}, {'bsie.extractor.generic.path.Path': {}}, {'bsie.extractor.generic.stat.Stat': {}}, {'bsie.extractor.generic.constant.Constant': dict( tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')], schema=''' bse:author rdfs:subClassOf bsfs:Predicate ; rdfs:domain bsfs:Entity ; rdfs:range xsd:string ; bsfs:unique "true"^^xsd:boolean . ''', )}, {'bsie.extractor.image.colors_spatial.ColorsSpatial': { 'width': 2, 'height': 2, 'exp': 2, }}, ]) # pipeline builder pbuild = PipelineBuilder( rbuild, ebuild, ) # build pipeline pipeline = pbuild.build() # build the naming policy naming_policy = DefaultNamingPolicy( host=args.host, user=args.user, ) # build BSIE frontend bsie = BSIE(pipeline, naming_policy, args.collect, args.discard) def walk(handle): """Walk through given input files.""" # FIXME: collect all triples by node, set all predicates at once # FIXME: simplify code (below but maybe also above) # FIXME: How to handle dependencies between data? # E.g. do I still want to link to a tag despite not being permitted to set its label? # index input paths for path in args.input_file: if not os.path.exists(path): pass # FIXME: notify the user elif os.path.isdir(path) and args.recursive: for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=args.follow): for filename in filenames: for node, pred, value in bsie.from_file(os.path.join(dirpath, filename)): handle(node, pred, value) elif os.path.isfile(path): for node, pred, value in bsie.from_file(path): handle(node, pred, value) else: raise errors.UnreachableError() if args.print: walk(print) return None # initialize bsfs # NOTE: With presistent storages, the schema migration will be a seaparte operation. # Here, we'd simply examine the schema and potentially discard more predicates. store = bsfs.Open(bsfs.init_sparql_store(args.user)) store.migrate(bsie.schema) # process files def handle(node, pred, value): if isinstance(value, node_.Node): value = store.node(value.node_type, value.uri) store.node(node.node_type, node.uri).set(pred.uri, value) walk(handle) # return store return store ## main ## if __name__ == '__main__': import sys main(sys.argv[1:]) ## EOF ##