aboutsummaryrefslogtreecommitdiffstats
path: root/bsie/lib/bsie.py
diff options
context:
space:
mode:
Diffstat (limited to 'bsie/lib/bsie.py')
-rw-r--r--bsie/lib/bsie.py80
1 files changed, 80 insertions, 0 deletions
diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py
new file mode 100644
index 0000000..aeccc8c
--- /dev/null
+++ b/bsie/lib/bsie.py
@@ -0,0 +1,80 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsie imports
+from bsie.tools.pipeline import Pipeline
+from bsie.utils import node, ns
+from bsie.utils.bsfs import URI, schema as schema_
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'BSIE',
+ )
+
+
+## code ##
+
+class BSIE():
+ """Extract triples from files.
+
+ Controls which predicates to extract (*collect*) and
+ which to not extract (*discard*). Note that this only affects
+ principal predicates not auxililary predicates like, e.g., tag labels.
+
+ """
+
+ # predicates to extract.
+ predicates: typing.Set[URI]
+
+ # local schema.
+ schema: schema_.Schema
+
+ def __init__(
+ self,
+ # pipeline builder.
+ pipeline: Pipeline,
+ # predicates to extract at most. None implies all available w.r.t. extractors.
+ collect: typing.Optional[typing.Iterable[URI]] = None,
+ # predicates to discard.
+ discard: typing.Optional[typing.Iterable[URI]] = None,
+ ):
+ # store pipeline
+ self.pipeline = pipeline
+ # start off with available predicates
+ self.predicates = {pred.uri for pred in self.pipeline.predicates()}
+ # limit predicates to specified ones by argument.
+ if collect is not None:
+ collect = set(collect)
+ if len(collect) > 0:
+ self.predicates &= collect
+ # discard predicates.
+ if discard is not None:
+ self.predicates -= set(discard)
+ # discard ns.bsfs.Predicate
+ self.predicates.discard(ns.bsfs.Predicate)
+ # compile a schema that only contains the requested predicates (and implied types)
+ self.schema = schema_.Schema({
+ self.pipeline.schema.predicate(pred) for pred in self.predicates})
+
+ def from_file(
+ self,
+ path: URI,
+ predicates: typing.Optional[typing.Iterable[URI]] = None,
+ ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]:
+ """Produce triples for a given *path*. Limit to *predicates* if given."""
+ # get requested predicates.
+ predicates = set(predicates) if predicates is not None else self.predicates
+ # filter through requested predicates.
+ predicates &= self.predicates
+ # predicate lookup
+ predicates = {self.schema.predicate(pred) for pred in predicates}
+ # invoke pipeline
+ yield from self.pipeline(path, predicates)
+
+## EOF ##