aboutsummaryrefslogtreecommitdiffstats
path: root/bsie/apps/index.py
blob: 8798c490f7b75bacbd22ef4a9d567b0705e88e38 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""

Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
# standard imports
import argparse
import os
import typing

# bsie imports
from bsie.extractor import ExtractorBuilder
from bsie.lib import BSIE, PipelineBuilder, DefaultNamingPolicy
from bsie.reader import ReaderBuilder
from bsie.utils import bsfs, errors, node as node_

# exports
# Only the command-line entry point is part of the public interface.
__all__: typing.Sequence[str] = ('main', )


## code ##

def main(argv):
    """Index files or directories into BSFS.

    Parses *argv*, builds the reader/extractor pipeline and the BSIE
    frontend, and walks over the given input paths. Every
    (node, predicate, value) triple produced by ``bsie.from_file`` is either
    printed (``--print``) or set on the respective node of a BSFS storage.

    Input paths that cannot be indexed (missing paths, directories given
    without ``--recursive``, paths that are neither a regular file nor a
    directory) are reported as a logged warning and skipped.

    Args:
        argv: Command-line arguments, without the program name.

    Returns:
        ``None`` if ``--print`` was given, otherwise the storage the triples
        were written to.
    """
    # Imported locally because the module itself does not import logging.
    import logging
    logger = logging.getLogger(__name__)

    # The description is spelled out (instead of reusing main.__doc__) so that
    # the help text stays a single line no matter how long the docstring is.
    parser = argparse.ArgumentParser(
        description='Index files or directories into BSFS.',
        prog='index',
        )
    parser.add_argument('--host', type=bsfs.URI, default=bsfs.URI('http://example.com'),
        help='')
    parser.add_argument('--user', type=str, default='me',
        help='')
    parser.add_argument('--collect', action='append', default=[],
        help='')
    parser.add_argument('--discard', action='append', default=[],
        help='')
    parser.add_argument('-r', '--recursive', action='store_true', default=False,
        help='')
    parser.add_argument('--follow', action='store_true', default=False,
        help='')
    parser.add_argument('--print', action='store_true', default=False,
        help='')
    # All remaining command-line arguments are taken as input paths.
    parser.add_argument('input_file', nargs=argparse.REMAINDER,
        help='')
    args = parser.parse_args(argv)

    # FIXME: Read reader/extractor configs from a config file
    # reader builder
    rbuild = ReaderBuilder()
    # extractor builder
    ebuild = ExtractorBuilder([
        {'bsie.extractor.preview.Preview': {
            'max_sides': [50],
            }},
        {'bsie.extractor.generic.path.Path': {}},
        {'bsie.extractor.generic.stat.Stat': {}},
        {'bsie.extractor.generic.constant.Constant': dict(
            tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')],
            schema='''
                bse:author rdfs:subClassOf bsfs:Predicate ;
                    rdfs:domain bsfs:Entity ;
                    rdfs:range xsd:string ;
                    bsfs:unique "true"^^xsd:boolean .
                ''',
            )},
        {'bsie.extractor.image.colors_spatial.ColorsSpatial': {
            'width': 2,
            'height': 2,
            'exp': 2,
            }},
        ])
    # pipeline builder
    pbuild = PipelineBuilder(
        rbuild,
        ebuild,
        )

    # build pipeline
    pipeline = pbuild.build()
    # build the naming policy
    naming_policy = DefaultNamingPolicy(
        host=args.host,
        user=args.user,
        )
    # build BSIE frontend
    bsie = BSIE(pipeline, naming_policy, args.collect, args.discard)

    def walk(handle):
        """Call *handle(node, pred, value)* for every triple of the input files."""
        # FIXME: collect all triples by node, set all predicates at once
        # FIXME: simplify code (below but maybe also above)
        # FIXME: How to handle dependencies between data?
        #        E.g. do I still want to link to a tag despite not being permitted to set its label?

        # index input paths
        for path in args.input_file:
            if not os.path.exists(path):
                logger.warning('skipping %s: no such file or directory', path)
            elif os.path.isdir(path):
                if not args.recursive:
                    # Without --recursive a directory is not descended into.
                    # It must not abort the run either, so it is skipped.
                    logger.warning(
                        'skipping directory %s: --recursive not specified', path)
                    continue
                for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=args.follow):
                    for filename in filenames:
                        for node, pred, value in bsie.from_file(os.path.join(dirpath, filename)):
                            handle(node, pred, value)
            elif os.path.isfile(path):
                for node, pred, value in bsie.from_file(path):
                    handle(node, pred, value)
            else:
                # The path exists but is neither a regular file nor a
                # directory (e.g. a FIFO or a device node).
                logger.warning('skipping %s: not a regular file or directory', path)

    if args.print:
        # Only show the triples, do not touch any storage.
        walk(print)
        return None

    # initialize bsfs
    # NOTE: With persistent storages, the schema migration will be a separate operation.
    # Here, we'd simply examine the schema and potentially discard more predicates.
    store = bsfs.Open(bsfs.init_sparql_store(args.user))
    store.migrate(bsie.schema)

    # process files
    def handle(node, pred, value):
        """Set *pred* to *value* on the storage node that corresponds to *node*."""
        if isinstance(value, node_.Node):
            # Node-valued triples must reference the storage's own node.
            value = store.node(value.node_type, value.uri)
        store.node(node.node_type, node.uri).set(pred.uri, value)
    walk(handle)
    # return store
    return store



## main ##

if __name__ == '__main__':
    # Executed as a script: hand the command-line arguments (minus the
    # program name) over to main.
    import sys
    cli_args = sys.argv[1:]
    main(cli_args)

## EOF ##