aboutsummaryrefslogtreecommitdiffstats
path: root/bsie/apps/index.py
blob: 05218f83408bd7c81a37965d0c2d91a50811a69d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115

# standard imports
import argparse
import typing

# external imports
from tqdm import tqdm

# bsie imports
from bsie.lib import BSIE
from bsie.matcher import nodes, DefaultMatcher
from bsie.utils import bsfs, list_files

# inner-module imports
from . import _loader

# exports
__all__: typing.Sequence[str] = (
    'main',
    )


## code ##

def main(argv: typing.Sequence[str]):
    """Index files or directories into BSFS."""
    # NOTE: The docstring above doubles as the command line description
    # (it is passed to the ArgumentParser below), so it is kept to one line
    # and the details are documented in comments instead.
    #
    # argv: command line arguments, without the program name.
    #
    # Returns the bsfs store that was populated, or None if the triples were
    # printed (--print) or appended to a file (--output) instead.
    parser = argparse.ArgumentParser(description=main.__doc__, prog='index')
    parser.add_argument('--config', type=str,
        default=_loader.DEFAULT_CONFIG_FILE,
        help='Path to the config file.')
    parser.add_argument('--host', type=bsfs.URI, default=bsfs.URI('http://example.com'),
        help='')
    parser.add_argument('--user', type=str, default='me',
        help='')
    parser.add_argument('--collect', action='append', default=[],
        help='')
    parser.add_argument('--discard', action='append', default=[],
        help='')
    parser.add_argument('-r', '--recursive', action='store_true', default=False,
        help='')
    parser.add_argument('--follow', action='store_true', default=False,
        help='')
    parser.add_argument('--print', action='store_true', default=False,
        help='')
    parser.add_argument('--output', type=str, default=None,
        help='')
    parser.add_argument('input_file', nargs=argparse.REMAINDER,
        help='')
    args = parser.parse_args(argv)

    # build pipeline
    pipeline = _loader.load_pipeline(args.config)
    # build the node matcher
    matcher = DefaultMatcher(
        host=args.host,
        user=args.user,
        )
    # build BSIE frontend
    bsie = BSIE(pipeline, matcher, args.collect, args.discard)

    def walk(handle, status):
        """Walk through given input files.

        Every (node, predicate, value) triple that BSIE produces for a file
        is passed to *handle*. *status* wraps the iterable of file paths,
        e.g. to display progress.
        """
        # FIXME: collect all triples by node, set all predicates at once
        # FIXME: How to handle dependencies between data?
        #        E.g. do I still want to link to a tag despite not being permitted to set its label?
        for path in status(list_files(args.input_file, args.recursive, args.follow)):
            for node, pred, value in bsie.from_file(path):
                handle(node, pred, value)

    # Only set in the --output mode; tracked here so that the file can be
    # closed once all triples were written.
    ofile = None

    if args.print:
        # print triples to stdout
        def handle(node, pred, value):
            if isinstance(value, nodes.Node):
                value = value.uri
            print(node.uri, pred.uri, value)
        # no progress bar, it would interleave with the printed triples
        status = lambda x: x
        ret = None

    elif args.output:
        # append triples as comma-separated lines to the output file
        ofile = open(args.output, 'at', encoding='UTF-8')
        def handle(node, pred, value):
            if isinstance(value, nodes.Node):
                value = value.uri
            try:
                ofile.write(f'{node.uri},{pred.uri},{value}\n')
            except Exception as err:
                # best effort: report the failed triple and continue
                print(err)
        status = tqdm
        ret = None

    else:
        # initialize bsfs
        # NOTE: With persistent storages, the schema migration will be a separate operation.
        # Here, we'd simply examine the schema and potentially discard more predicates.
        store = bsfs.Open(bsfs.init_sparql_store(args.user))
        store.migrate(bsie.schema)
        # process files
        def handle(node, pred, value):
            if isinstance(value, nodes.Node):
                value = store.node(value.node_type, value.uri)
            store.node(node.node_type, node.uri).set(pred.uri, value)
        status = tqdm
        ret = store

    try:
        walk(handle, status=status)
    finally:
        # Close (and thereby flush) the output file, even if indexing failed.
        if ofile is not None:
            ofile.close()
    # return store
    return ret


## main ##

if __name__ == '__main__':
    # Script entry point: forward the command line (minus the program name).
    import sys
    cli_args = sys.argv[1:]
    main(cli_args)

## EOF ##