"""
Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
# imports
import argparse
import os
import typing
# bsfs imports
import bsfs
# bsie imports
from bsie.base import errors
from bsie.lib.bsie import BSIE
from bsie.tools import builder
from bsie.utils.bsfs import URI
# exports
__all__: typing.Sequence[str] = (
    'main',
)
## code ##
def main(argv: typing.Sequence[str]):
    """Index files or directories into BSFS."""
    parser = argparse.ArgumentParser(description=main.__doc__, prog='index')
    parser.add_argument('--user', type=URI, default=URI('http://example.com/me'),
        help='User URI; also used as the namespace prefix for created file nodes.')
    parser.add_argument('--collect', action='append', default=[],
        help='Predicate URI to collect. May be given multiple times.')
    parser.add_argument('--discard', action='append', default=[],
        help='Predicate URI to discard. May be given multiple times.')
    parser.add_argument('-r', '--recursive', action='store_true', default=False,
        help='Recurse into directories.')
    parser.add_argument('--follow', action='store_true', default=False,
        help='Follow symbolic links when recursing into directories.')
    parser.add_argument('--print', action='store_true', default=False,
        help='Print the extracted triples instead of storing them.')
    parser.add_argument('input_file', nargs=argparse.REMAINDER,
        help='Files or directories to index.')
    args = parser.parse_args(argv)
    # FIXME: Read reader/extractor configs from a config file
    # reader builder
    rbuild = builder.ReaderBuilder({})
    # extractor builder
    ebuild = builder.ExtractorBuilder([
        {'bsie.extractor.generic.path.Path': {}},
        {'bsie.extractor.generic.stat.Stat': {}},
        {'bsie.extractor.generic.constant.Constant': dict(
            tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')],
            schema='''
                bse:author rdfs:subClassOf bsfs:Predicate ;
                    rdfs:domain bsfs:Entity ;
                    rdfs:range xsd:string ;
                    bsfs:unique "true"^^xsd:boolean .
                ''',
            )},
        ])
    # pipeline builder
    prefix = URI(args.user + ('file#' if args.user.endswith('/') else '/file#'))
    pbuild = builder.PipelineBuilder(
        prefix,
        rbuild,
        ebuild,
        )
    # build pipeline
    pipeline = pbuild.build()
    # build BSIE frontend
    bsie = BSIE(pipeline, args.collect, args.discard)
    def walk(handle):
        """Walk through the given input files and pass each extracted triple to handle."""
        # FIXME: collect all triples by node, set all predicates at once
        # FIXME: simplify code (below but maybe also above)
        # FIXME: How to handle dependencies between data?
        #        E.g. do I still want to link to a tag despite not being permitted to set its label?
        # FIXME: node renaming?
        # index input paths
        for path in args.input_file:
            if os.path.isdir(path) and args.recursive:
                # walk the directory tree and index every file found
                for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=args.follow):
                    for filename in filenames:
                        for node, pred, value in bsie.from_file(os.path.join(dirpath, filename)):
                            handle(node, pred, value)
            elif os.path.isfile(path):
                # index a single file
                for node, pred, value in bsie.from_file(path):
                    handle(node, pred, value)
            else:
                # path is neither a regular file nor a directory traversed with --recursive
                raise errors.UnreachableError()

    if args.print:
        # print triples to stdout; no store is created
        walk(print)
        return None
    else:
        # initialize bsfs
        # NOTE: With persistent storages, the schema migration will be a separate operation.
        #       Here, we'd simply examine the schema and potentially discard more predicates.
        store = bsfs.Open({
            'Graph': {
                'user': args.user,
                'backend': {
                    'SparqlStore': {}},
            }})
        store.migrate(bsie.schema)
        # process files
        def handle(node, pred, value):
            store.node(node.node_type, node.uri).set(pred.uri, value)
        walk(handle)
        # return store
        return store
## main ##
if __name__ == '__main__':
    import sys
    main(sys.argv[1:])
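
# Example invocations (a sketch; the module path used to launch this script is an
# assumption and depends on how bsie is installed):
#
#   python -m bsie.apps.index --print --recursive ~/Pictures
#   python -m bsie.apps.index --discard http://bsfs.ai/schema/Entity#author some_file.txt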
## EOF ##