1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
|
"""
Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
# standard imports
import argparse
import os
import sys
import typing
# bsie imports
from bsie.extractor import ExtractorBuilder
from bsie.lib import BSIE, PipelineBuilder, DefaultNamingPolicy
from bsie.reader import ReaderBuilder
from bsie.utils import bsfs, errors
# exports
__all__: typing.Sequence[str] = ('main',)  # main is the only exported name
## code ##
def main(argv):
    """Index files or directories into BSFS."""
    # NOTE: The docstring above is passed to argparse as the ``--help``
    # description, so it is kept to one line and the details live here.
    #
    # Parameters:
    #   argv: command line arguments without the program name
    #         (the script entry point passes ``sys.argv[1:]``).
    #
    # Returns:
    #   ``None`` if ``--print`` was given (every triple is printed instead of
    #   being stored), otherwise the store that the triples were written to.
    #
    # Raises:
    #   SystemExit: raised by argparse on invalid arguments or ``--help``.
    #   errors.UnreachableError: if an input path exists but is neither a
    #       directory nor a regular file.
    #
    # Input paths that do not exist, and directories passed without
    # ``--recursive``, are reported on stderr and skipped.
    parser = argparse.ArgumentParser(description=main.__doc__, prog='index')
    parser.add_argument('--host', type=bsfs.URI, default=bsfs.URI('http://example.com'),
                        help='')
    parser.add_argument('--user', type=str, default='me',
                        help='')
    parser.add_argument('--collect', action='append', default=[],
                        help='')
    parser.add_argument('--discard', action='append', default=[],
                        help='')
    parser.add_argument('-r', '--recursive', action='store_true', default=False,
                        help='')
    parser.add_argument('--follow', action='store_true', default=False,
                        help='')
    parser.add_argument('--print', action='store_true', default=False,
                        help='')
    parser.add_argument('input_file', nargs=argparse.REMAINDER,
                        help='')
    args = parser.parse_args(argv)

    # FIXME: Read reader/extractor configs from a config file
    # reader builder
    rbuild = ReaderBuilder()
    # extractor builder
    # NOTE: The schema text below is kept flush-left so that the contents of
    # the string literal stay exactly as they were.
    ebuild = ExtractorBuilder([
        {'bsie.extractor.generic.path.Path': {}},
        {'bsie.extractor.generic.stat.Stat': {}},
        {'bsie.extractor.generic.constant.Constant': dict(
            tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')],
            schema='''
bse:author rdfs:subClassOf bsfs:Predicate ;
rdfs:domain bsfs:Entity ;
rdfs:range xsd:string ;
bsfs:unique "true"^^xsd:boolean .
''',
        )},
        {'bsie.extractor.image.colors_spatial.ColorsSpatial': {
            'width': 2,
            'height': 2,
            'exp': 2,
        }},
    ])
    # pipeline builder
    pbuild = PipelineBuilder(
        rbuild,
        ebuild,
    )

    # build pipeline
    pipeline = pbuild.build()
    # build the naming policy
    naming_policy = DefaultNamingPolicy(
        host=args.host,
        user=args.user,
    )
    # build BSIE frontend
    bsie = BSIE(pipeline, naming_policy, args.collect, args.discard)

    def skip(path, reason):
        """Report on stderr that *path* is not being indexed."""
        # stderr keeps the notice out of the triples that --print writes to stdout.
        print(f'{parser.prog}: skipping {path}: {reason}', file=sys.stderr)

    def walk(handle):
        """Call ``handle(node, pred, value)`` for every triple of the input files."""
        # FIXME: collect all triples by node, set all predicates at once
        # FIXME: simplify code (below but maybe also above)
        # FIXME: How to handle dependencies between data?
        #        E.g. do I still want to link to a tag despite not being permitted to set its label?

        # index input paths
        for path in args.input_file:
            if not os.path.exists(path):
                skip(path, 'no such file or directory')
            elif os.path.isdir(path):
                if not args.recursive:
                    # Without this check a plain directory argument would end
                    # up in the "unreachable" branch below.
                    skip(path, 'is a directory (use --recursive to descend into it)')
                    continue
                for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=args.follow):
                    for filename in filenames:
                        for node, pred, value in bsie.from_file(os.path.join(dirpath, filename)):
                            handle(node, pred, value)
            elif os.path.isfile(path):
                for node, pred, value in bsie.from_file(path):
                    handle(node, pred, value)
            else:
                # The path exists but is neither a directory nor a regular file.
                raise errors.UnreachableError()

    if args.print:
        # Dry run: dump the triples and do not create a store.
        walk(print)
        return None

    # initialize bsfs
    # NOTE: With persistent storages, the schema migration will be a separate operation.
    # Here, we'd simply examine the schema and potentially discard more predicates.
    store = bsfs.Open(bsfs.init_sparql_store(args.user))
    store.migrate(bsie.schema)

    # process files
    def handle(node, pred, value):
        """Write a single triple to the store."""
        store.node(node.node_type, node.uri).set(pred.uri, value)

    walk(handle)

    # return store
    return store
## main ##
if __name__ == '__main__':
    # Script entry point: everything after the program name goes to main().
    import sys
    cli_args = sys.argv[1:]
    main(cli_args)
## EOF ##
|