# path: root/bsie/apps/index.py
# blob: 821aa4c48c53c6ee58c0ca4d01411506a8e4f189
"""

Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
# imports
import argparse
import os
import typing

# bsfs imports
import bsfs

# bsie imports
from bsie.base import errors
from bsie.lib.bsie import BSIE
from bsie.tools import builder
from bsie.utils.bsfs import URI

# exports
# Only the `main` entry point is part of this module's public interface.
__all__: typing.Sequence[str] = (
    'main',
    )


## code ##

def main(argv):
    """Index files or directories into BSFS."""
    # NOTE: The docstring above is also the command line description
    # (handed to argparse via main.__doc__), hence it stays a one-liner.
    #
    # argv:    command line arguments, without the program name.
    # returns: None if --print was given (triples go to stdout), otherwise
    #          the object returned by bsfs.Open after all triples were stored.
    # raises:  errors.UnreachableError for an input path that is neither a
    #          regular file nor a directory combined with --recursive.

    # command line interface
    cli = argparse.ArgumentParser(prog='index', description=main.__doc__)
    cli.add_argument('--user', default=URI('http://example.com/me'), type=URI, help='')
    cli.add_argument('--collect', default=[], action='append', help='')
    cli.add_argument('--discard', default=[], action='append', help='')
    cli.add_argument('-r', '--recursive', default=False, action='store_true', help='')
    cli.add_argument('--follow', default=False, action='store_true', help='')
    cli.add_argument('--print', default=False, action='store_true', help='')
    # everything after the first positional argument is taken as an input path
    cli.add_argument('input_file', nargs=argparse.REMAINDER, help='')
    args = cli.parse_args(argv)

    # FIXME: Read reader/extractor configs from a config file
    # Schema text handed to the Constant extractor alongside its tuples.
    # The whitespace inside the literal is part of the value; do not re-indent.
    author_schema = '''
                bse:author rdfs:subClassOf bsfs:Predicate ;
                    rdfs:domain bsfs:Entity ;
                    rdfs:range xsd:string ;
                    bsfs:unique "true"^^xsd:boolean .
                '''
    extractor_config = [
        {'bsie.extractor.generic.path.Path': {}},
        {'bsie.extractor.generic.stat.Stat': {}},
        {'bsie.extractor.generic.constant.Constant': {
            'tuples': [('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')],
            'schema': author_schema,
            }},
        ]

    # reader and extractor builders
    reader_builder = builder.ReaderBuilder({})
    extractor_builder = builder.ExtractorBuilder(extractor_config)

    # prefix for the pipeline builder: "<user>/file#" without doubling the slash
    if args.user.endswith('/'):
        suffix = 'file#'
    else:
        suffix = '/file#'
    prefix = URI(args.user + suffix)

    # pipeline and BSIE frontend
    pipeline = builder.PipelineBuilder(prefix, reader_builder, extractor_builder).build()
    indexer = BSIE(pipeline, args.collect, args.discard)

    def iter_files():
        """Lazily yield the path of each file selected by the input paths, in order."""
        for path in args.input_file:
            if os.path.isdir(path) and args.recursive:
                # descend into the directory; symlinks are followed only with --follow
                for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=args.follow):
                    for filename in filenames:
                        yield os.path.join(dirpath, filename)
            elif os.path.isfile(path):
                yield path
            else:
                # NOTE(review): reachable in practice, e.g. for a directory given
                # without --recursive or for a path that does not exist.
                raise errors.UnreachableError()

    def walk(handle):
        """Pass every (node, predicate, value) triple of the input files to *handle*."""
        # FIXME: collect all triples by node, set all predicates at once
        # FIXME: simplify code (below but maybe also above)
        # FIXME: How to handle dependencies between data?
        #        E.g. do I still want to link to a tag despite not being permitted to set its label?
        # FIXME: node renaming?
        for file_path in iter_files():
            for node, pred, value in indexer.from_file(file_path):
                handle(node, pred, value)

    # print mode: dump the triples and skip the storage entirely
    if args.print:
        walk(print)
        return None

    # initialize bsfs
    # NOTE: With persistent storages, the schema migration will be a separate operation.
    # Here, we'd simply examine the schema and potentially discard more predicates.
    backend_config = {'SparqlStore': {}}
    graph_config = {'user': args.user, 'backend': backend_config}
    store = bsfs.Open({'Graph': graph_config})
    store.migrate(indexer.schema)

    def store_triple(node, pred, value):
        """Set predicate *pred* to *value* on the storage node matching *node*."""
        store.node(node.node_type, node.uri).set(pred.uri, value)

    # process files
    walk(store_triple)
    return store



## main ##

if __name__ == '__main__':
    # Run as a script: forward everything after the program name to main().
    import sys
    cli_args = sys.argv[1:]
    main(cli_args)

## EOF ##