about summary refs log tree commit diff stats
path: root/bsie
diff options
context:
space:
mode:
Diffstat (limited to 'bsie')
-rw-r--r-- bsie/__init__.py 6
-rw-r--r-- bsie/apps/__init__.py 20
-rw-r--r-- bsie/apps/index.py 131
-rw-r--r-- bsie/apps/info.py 74
-rw-r--r-- bsie/base/errors.py 6
-rw-r--r-- bsie/lib/__init__.py 13
-rw-r--r-- bsie/lib/bsie.py 80
-rw-r--r-- bsie/tools/pipeline.py 4
-rw-r--r-- bsie/utils/namespaces.py 2
9 files changed, 335 insertions, 1 deletions
diff --git a/bsie/__init__.py b/bsie/__init__.py
index 2f2477a..2b874bd 100644
--- a/bsie/__init__.py
+++ b/bsie/__init__.py
@@ -5,8 +5,14 @@ A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
# imports
+import collections
import typing
+# constants
+version_info = collections.namedtuple('version_info',
+ ('major', 'minor', 'micro')) \
+ (0, 0, 1)
+
# exports
__all__: typing.Sequence[str] = []
diff --git a/bsie/apps/__init__.py b/bsie/apps/__init__.py
new file mode 100644
index 0000000..a548c3c
--- /dev/null
+++ b/bsie/apps/__init__.py
@@ -0,0 +1,20 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from .index import main as index
+from .info import main as info
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'index',
+ 'info',
+ )
+
+## EOF ##
diff --git a/bsie/apps/index.py b/bsie/apps/index.py
new file mode 100644
index 0000000..821aa4c
--- /dev/null
+++ b/bsie/apps/index.py
@@ -0,0 +1,131 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import argparse
+import os
+import typing
+
+# bsfs imports
+import bsfs
+
+# bsie imports
+from bsie.base import errors
+from bsie.lib.bsie import BSIE
+from bsie.tools import builder
+from bsie.utils.bsfs import URI
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'main',
+ )
+
+
+## code ##
+
+def main(argv):
+ """Index files or directories into BSFS."""
+ parser = argparse.ArgumentParser(description=main.__doc__, prog='index')
+ parser.add_argument('--user', type=URI, default=URI('http://example.com/me'),
+ help='')
+ parser.add_argument('--collect', action='append', default=[],
+ help='')
+ parser.add_argument('--discard', action='append', default=[],
+ help='')
+ parser.add_argument('-r', '--recursive', action='store_true', default=False,
+ help='')
+ parser.add_argument('--follow', action='store_true', default=False,
+ help='')
+ parser.add_argument('--print', action='store_true', default=False,
+ help='')
+ parser.add_argument('input_file', nargs=argparse.REMAINDER,
+ help='')
+ args = parser.parse_args(argv)
+
+ # FIXME: Read reader/extractor configs from a config file
+ # reader builder
+ rbuild = builder.ReaderBuilder({})
+ # extractor builder
+ ebuild = builder.ExtractorBuilder([
+ {'bsie.extractor.generic.path.Path': {}},
+ {'bsie.extractor.generic.stat.Stat': {}},
+ {'bsie.extractor.generic.constant.Constant': dict(
+ tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')],
+ schema='''
+ bse:author rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+ ''',
+ )},
+ ])
+ # pipeline builder
+ prefix = URI(args.user + ('file#' if args.user.endswith('/') else '/file#'))
+ pbuild = builder.PipelineBuilder(
+ prefix,
+ rbuild,
+ ebuild,
+ )
+
+ # build pipeline
+ pipeline = pbuild.build()
+ # build BSIE frontend
+ bsie = BSIE(pipeline, args.collect, args.discard)
+
+
+ def walk(handle):
+ """Walk through given input files."""
+ # FIXME: collect all triples by node, set all predicates at once
+ # FIXME: simplify code (below but maybe also above)
+ # FIXME: How to handle dependencies between data?
+ # E.g. do I still want to link to a tag despite not being permitted to set its label?
+ # FIXME: node renaming?
+
+ # index input paths
+ for path in args.input_file:
+ if os.path.isdir(path) and args.recursive:
+ for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=args.follow):
+ for filename in filenames:
+ for node, pred, value in bsie.from_file(os.path.join(dirpath, filename)):
+ handle(node, pred, value)
+ elif os.path.isfile(path):
+ for node, pred, value in bsie.from_file(path):
+ handle(node, pred, value)
+ else:
+ raise errors.UnreachableError()
+
+
+ if args.print:
+ walk(print)
+ return None
+
+ else:
+ # initialize bsfs
+ # NOTE: With persistent storages, the schema migration will be a separate operation.
+ # Here, we'd simply examine the schema and potentially discard more predicates.
+ store = bsfs.Open({
+ 'Graph': {
+ 'user': args.user,
+ 'backend': {
+ 'SparqlStore': {}},
+ }})
+ store.migrate(bsie.schema)
+ # process files
+ def handle(node, pred, value):
+ store.node(node.node_type, node.uri).set(pred.uri, value)
+ walk(handle)
+ # return store
+ return store
+
+
+
+## main ##
+
+if __name__ == '__main__':
+ import sys
+ main(sys.argv[1:])
+
+## EOF ##
diff --git a/bsie/apps/info.py b/bsie/apps/info.py
new file mode 100644
index 0000000..8cc6dca
--- /dev/null
+++ b/bsie/apps/info.py
@@ -0,0 +1,74 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import argparse
+import sys
+import typing
+
+# bsie imports
+from bsie.base import errors
+from bsie.tools import builder
+from bsie.utils.bsfs import URI
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'main',
+ )
+
+
+## code ##
+
+def main(argv):
+ """Show information from BSIE."""
+ parser = argparse.ArgumentParser(description=main.__doc__, prog='info')
+ parser.add_argument('what', choices=('predicates', ),
+ help='Select what information to show.')
+ args = parser.parse_args(argv)
+
+ # FIXME: Read reader/extractor configs from a config file
+ # reader builder
+ rbuild = builder.ReaderBuilder({})
+ # extractor builder
+ ebuild = builder.ExtractorBuilder([
+ {'bsie.extractor.generic.path.Path': {}},
+ {'bsie.extractor.generic.stat.Stat': {}},
+ {'bsie.extractor.generic.constant.Constant': dict(
+ tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')],
+ schema='''
+ bse:author rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+ ''',
+ )},
+ ])
+ # pipeline builder
+ pbuild = builder.PipelineBuilder(
+ URI('http://example.com/me/file#'), # not actually used
+ rbuild,
+ ebuild,
+ )
+
+ # build pipeline
+ pipeline = pbuild.build()
+
+ # show info
+ if args.what == 'predicates':
+ # show predicates
+ for pred in pipeline.schema.predicates():
+ print(pred.uri)
+ else:
+ # args.what is already checked by argparse
+ raise errors.UnreachableError()
+
+
+## main ##
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
+
+## EOF ##
diff --git a/bsie/base/errors.py b/bsie/base/errors.py
index 760351f..dc3c30e 100644
--- a/bsie/base/errors.py
+++ b/bsie/base/errors.py
@@ -33,4 +33,10 @@ class ExtractorError(_BSIEError):
class ReaderError(_BSIEError):
"""The Reader failed to read the given file."""
+class ProgrammingError(_BSIEError):
+ """An assertion-like error that indicates a code-base issue."""
+
+class UnreachableError(ProgrammingError):
+ """Bravo, you've reached a point in code that should logically not be reachable."""
+
## EOF ##
diff --git a/bsie/lib/__init__.py b/bsie/lib/__init__.py
new file mode 100644
index 0000000..f6c9018
--- /dev/null
+++ b/bsie/lib/__init__.py
@@ -0,0 +1,13 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# exports
+__all__: typing.Sequence[str] = []
+
+## EOF ##
diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py
new file mode 100644
index 0000000..aeccc8c
--- /dev/null
+++ b/bsie/lib/bsie.py
@@ -0,0 +1,80 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsie imports
+from bsie.tools.pipeline import Pipeline
+from bsie.utils import node, ns
+from bsie.utils.bsfs import URI, schema as schema_
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'BSIE',
+ )
+
+
+## code ##
+
+class BSIE():
+ """Extract triples from files.
+
+ Controls which predicates to extract (*collect*) and
+ which to not extract (*discard*). Note that this only affects
+ principal predicates, not auxiliary predicates like, e.g., tag labels.
+
+ """
+
+ # predicates to extract.
+ predicates: typing.Set[URI]
+
+ # local schema.
+ schema: schema_.Schema
+
+ def __init__(
+ self,
+ # pipeline that produces the triples for a file.
+ pipeline: Pipeline,
+ # predicates to extract at most. None implies all available w.r.t. extractors.
+ collect: typing.Optional[typing.Iterable[URI]] = None,
+ # predicates to discard.
+ discard: typing.Optional[typing.Iterable[URI]] = None,
+ ):
+ # store pipeline
+ self.pipeline = pipeline
+ # start off with available predicates
+ self.predicates = {pred.uri for pred in self.pipeline.predicates()}
+ # limit predicates to specified ones by argument.
+ if collect is not None:
+ collect = set(collect)
+ if len(collect) > 0:
+ self.predicates &= collect
+ # discard predicates.
+ if discard is not None:
+ self.predicates -= set(discard)
+ # discard ns.bsfs.Predicate
+ self.predicates.discard(ns.bsfs.Predicate)
+ # compile a schema that only contains the requested predicates (and implied types)
+ self.schema = schema_.Schema({
+ self.pipeline.schema.predicate(pred) for pred in self.predicates})
+
+ def from_file(
+ self,
+ path: URI,
+ predicates: typing.Optional[typing.Iterable[URI]] = None,
+ ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]:
+ """Produce triples for a given *path*. Limit to *predicates* if given."""
+ # get requested predicates.
+ predicates = set(predicates) if predicates is not None else self.predicates
+ # filter through requested predicates.
+ predicates &= self.predicates
+ # predicate lookup
+ predicates = {self.schema.predicate(pred) for pred in predicates}
+ # invoke pipeline
+ yield from self.pipeline(path, predicates)
+
+## EOF ##
diff --git a/bsie/tools/pipeline.py b/bsie/tools/pipeline.py
index 8e1c992..da422c0 100644
--- a/bsie/tools/pipeline.py
+++ b/bsie/tools/pipeline.py
@@ -70,6 +70,10 @@ class Pipeline():
and self._prefix == other._prefix \
and self._ext2rdr == other._ext2rdr
+ def predicates(self) -> typing.Iterator[_schema.Predicate]:
+ """Return the predicates that are extracted from a file."""
+ return iter({pred for ext in self._ext2rdr for pred in ext.predicates()})
+
def __call__(
self,
path: URI,
diff --git a/bsie/utils/namespaces.py b/bsie/utils/namespaces.py
index 13be96b..2fcb2dc 100644
--- a/bsie/utils/namespaces.py
+++ b/bsie/utils/namespaces.py
@@ -13,7 +13,7 @@ from . import bsfs as _bsfs
# constants
bse = _bsfs.Namespace('http://bsfs.ai/schema/Entity#')
bsfs = _bsfs.Namespace('http://bsfs.ai/schema/')
-bsm = _bsfs.Namespace('http://bsfs.ai/schema/meta#')
+bsm = _bsfs.Namespace('http://bsfs.ai/schema/Meta#')
xsd = _bsfs.Namespace('http://www.w3.org/2001/XMLSchema#')
# export