"""
Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
# imports
import argparse
import os
import typing
# bsie imports
from bsie.base import errors
from bsie.lib import BSIE
from bsie.tools import builder
from bsie.utils import bsfs
from bsie.utils.bsfs import URI  # NOTE: assumes URI is re-exported via bsie.utils.bsfs
# exports
__all__: typing.Sequence[str] = (
'main',
)
## code ##
def main(argv):
"""Index files or directories into BSFS."""
parser = argparse.ArgumentParser(description=main.__doc__, prog='index')
    parser.add_argument('--user', type=URI, default=URI('http://example.com/me'),
                        help='User URI; owns the stored data and prefixes created node URIs.')
    parser.add_argument('--collect', action='append', default=[],
                        help='Predicate to collect; can be given multiple times.')
    parser.add_argument('--discard', action='append', default=[],
                        help='Predicate to discard; can be given multiple times.')
    parser.add_argument('-r', '--recursive', action='store_true', default=False,
                        help='Recurse into directories.')
    parser.add_argument('--follow', action='store_true', default=False,
                        help='Follow symbolic links when recursing into directories.')
    parser.add_argument('--print', action='store_true', default=False,
                        help='Print the extracted triples instead of storing them.')
    parser.add_argument('input_file', nargs=argparse.REMAINDER,
                        help='Files or directories to index.')
args = parser.parse_args(argv)
# FIXME: Read reader/extractor configs from a config file
# reader builder
rbuild = builder.ReaderBuilder({})
# extractor builder
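    # configured extractors: file path, file stat info, and a constant author annotation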
ebuild = builder.ExtractorBuilder([
{'bsie.extractor.generic.path.Path': {}},
{'bsie.extractor.generic.stat.Stat': {}},
{'bsie.extractor.generic.constant.Constant': dict(
tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')],
schema='''
bse:author rdfs:subClassOf bsfs:Predicate ;
rdfs:domain bsfs:Entity ;
rdfs:range xsd:string ;
bsfs:unique "true"^^xsd:boolean .
''',
)},
])
# pipeline builder
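    # node URIs are minted below the user's namespace, e.g. http://example.com/me/file#...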
prefix = URI(args.user + ('file#' if args.user.endswith('/') else '/file#'))
pbuild = builder.PipelineBuilder(
prefix,
rbuild,
ebuild,
)
# build pipeline
pipeline = pbuild.build()
# build BSIE frontend
bsie = BSIE(pipeline, args.collect, args.discard)
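    # --collect/--discard restrict which predicates the frontend produces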
def walk(handle):
"""Walk through given input files."""
# FIXME: collect all triples by node, set all predicates at once
# FIXME: simplify code (below but maybe also above)
# FIXME: How to handle dependencies between data?
# E.g. do I still want to link to a tag despite not being permitted to set its label?
# FIXME: node renaming?
# index input paths
for path in args.input_file:
if os.path.isdir(path) and args.recursive:
for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=args.follow):
for filename in filenames:
for node, pred, value in bsie.from_file(os.path.join(dirpath, filename)):
handle(node, pred, value)
elif os.path.isfile(path):
for node, pred, value in bsie.from_file(path):
handle(node, pred, value)
else:
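                # paths that are neither regular files nor directories walked with --recursive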
raise errors.UnreachableError()
if args.print:
walk(print)
return None
else:
# initialize bsfs
        # NOTE: With persistent storages, the schema migration will be a separate operation.
# Here, we'd simply examine the schema and potentially discard more predicates.
store = bsfs.Open({
'Graph': {
'user': args.user,
'backend': {
'SparqlStore': {}},
}})
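        # extend the store's schema to cover the predicates the pipeline can produce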
store.migrate(bsie.schema)
# process files
def handle(node, pred, value):
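            """Write a single (node, predicate, value) triple to the store."""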
store.node(node.node_type, node.uri).set(pred.uri, value)
walk(handle)
# return store
return store
## main ##
if __name__ == '__main__':
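    # example invocation: index --recursive --print <path>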
import sys
main(sys.argv[1:])
## EOF ##