aboutsummaryrefslogtreecommitdiffstats
path: root/bsie/apps/index.py
diff options
context:
space:
mode:
author Matthias Baumgartner <dev@igsor.net> 2022-12-14 06:10:25 +0100
committer Matthias Baumgartner <dev@igsor.net> 2022-12-14 06:10:25 +0100
commit 559e643bb1fa39feefd2eb73847ad9420daf1deb (patch)
tree 13243e2e25edd90d16789e6f6e4e473748f347a4 /bsie/apps/index.py
parent edc747252a04675c46059215751719b6666a77f9 (diff)
download bsie-559e643bb1fa39feefd2eb73847ad9420daf1deb.tar.gz
bsie-559e643bb1fa39feefd2eb73847ad9420daf1deb.tar.bz2
bsie-559e643bb1fa39feefd2eb73847ad9420daf1deb.zip
bsie extraction and info apps
Diffstat (limited to 'bsie/apps/index.py')
-rw-r--r-- bsie/apps/index.py 131
1 files changed, 131 insertions, 0 deletions
diff --git a/bsie/apps/index.py b/bsie/apps/index.py
new file mode 100644
index 0000000..821aa4c
--- /dev/null
+++ b/bsie/apps/index.py
@@ -0,0 +1,131 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import argparse
+import os
+import typing
+
+# bsfs imports
+import bsfs
+
+# bsie imports
+from bsie.base import errors
+from bsie.lib.bsie import BSIE
+from bsie.tools import builder
+from bsie.utils.bsfs import URI
+
+# exports
+# Public API of this module: only the command line entry point is exported.
+__all__: typing.Sequence[str] = ('main', )
+
+
+## code ##
+
+def main(argv):
+    """Index files or directories into BSFS.
+
+    Args:
+        argv: Command line arguments, without the program name.
+
+    Returns:
+        None if ``--print`` is set (the extracted triples are printed instead
+        of being stored). Otherwise, the bsfs storage that the triples were
+        written to.
+
+    Raises:
+        SystemExit: Via argparse, if the arguments are invalid. This includes
+            input paths that are neither a file nor, with ``--recursive``,
+            a directory.
+    """
+    # NOTE: The description is a literal (not main.__doc__) so that the --help
+    # text remains the one-line summary.
+    parser = argparse.ArgumentParser(
+        description='Index files or directories into BSFS.',
+        prog='index',
+        )
+    parser.add_argument('--user', type=URI, default=URI('http://example.com/me'),
+        help='')
+    parser.add_argument('--collect', action='append', default=[],
+        help='')
+    parser.add_argument('--discard', action='append', default=[],
+        help='')
+    parser.add_argument('-r', '--recursive', action='store_true', default=False,
+        help='')
+    parser.add_argument('--follow', action='store_true', default=False,
+        help='')
+    parser.add_argument('--print', action='store_true', default=False,
+        help='')
+    parser.add_argument('input_file', nargs=argparse.REMAINDER,
+        help='')
+    args = parser.parse_args(argv)
+
+    # Reject unusable inputs before doing any work. Otherwise a mistyped path
+    # (or a directory without --recursive) would only be noticed halfway
+    # through indexing, after earlier inputs were already printed or stored.
+    # parser.error reports a usage error and exits; it does not return.
+    for path in args.input_file:
+        if os.path.isfile(path):
+            continue
+        if not os.path.isdir(path):
+            parser.error(f'{path}: not a file or directory')
+        if not args.recursive:
+            parser.error(f'{path}: is a directory (use --recursive to index its files)')
+
+    # FIXME: Read reader/extractor configs from a config file
+    # reader builder
+    rbuild = builder.ReaderBuilder({})
+    # extractor builder
+    ebuild = builder.ExtractorBuilder([
+        {'bsie.extractor.generic.path.Path': {}},
+        {'bsie.extractor.generic.stat.Stat': {}},
+        {'bsie.extractor.generic.constant.Constant': dict(
+            tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')],
+            schema='''
+                bse:author rdfs:subClassOf bsfs:Predicate ;
+                    rdfs:domain bsfs:Entity ;
+                    rdfs:range xsd:string ;
+                    bsfs:unique "true"^^xsd:boolean .
+                ''',
+        )},
+    ])
+    # pipeline builder
+    # The prefix is the user URI followed by 'file#', joined by exactly one slash.
+    prefix = URI(args.user + ('file#' if args.user.endswith('/') else '/file#'))
+    pbuild = builder.PipelineBuilder(
+        prefix,
+        rbuild,
+        ebuild,
+        )
+
+    # build pipeline
+    pipeline = pbuild.build()
+    # build BSIE frontend
+    bsie = BSIE(pipeline, args.collect, args.discard)
+
+
+    def walk(handle):
+        """Pass every (node, predicate, value) triple of the input files to *handle*.
+
+        Directories are traversed with os.walk if --recursive is set; symbolic
+        links are followed only if --follow is set.
+        """
+        # FIXME: collect all triples by node, set all predicates at once
+        # FIXME: simplify code (below but maybe also above)
+        # FIXME: How to handle dependencies between data?
+        #        E.g. do I still want to link to a tag despite not being permitted to set its label?
+        # FIXME: node renaming?
+
+        # index input paths
+        for path in args.input_file:
+            if os.path.isdir(path) and args.recursive:
+                for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=args.follow):
+                    for filename in filenames:
+                        for node, pred, value in bsie.from_file(os.path.join(dirpath, filename)):
+                            handle(node, pred, value)
+            elif os.path.isfile(path):
+                for node, pred, value in bsie.from_file(path):
+                    handle(node, pred, value)
+            else:
+                # All inputs were validated after argument parsing. This can
+                # only be hit if a path changed on disk in the meantime.
+                raise errors.UnreachableError()
+
+
+    # print-only mode: nothing is stored, hence there is no store to return
+    if args.print:
+        walk(print)
+        return None
+
+    # initialize bsfs
+    # NOTE: With persistent storages, the schema migration will be a separate operation.
+    # Here, we'd simply examine the schema and potentially discard more predicates.
+    store = bsfs.Open({
+        'Graph': {
+            'user': args.user,
+            'backend': {
+                'SparqlStore': {}},
+        }})
+    store.migrate(bsie.schema)
+
+    # process files
+    def handle(node, pred, value):
+        """Write a single triple to the store."""
+        store.node(node.node_type, node.uri).set(pred.uri, value)
+    walk(handle)
+    # return store
+    return store
+
+
+
+## main ##
+
+if __name__ == '__main__':
+    # Script entry point: main expects the arguments without the program name.
+    from sys import argv as cli_args
+    main(cli_args[1:])
+
+## EOF ##