diff options
Diffstat (limited to 'bsie/lib/bsie.py')
-rw-r--r-- | bsie/lib/bsie.py | 92 |
1 files changed, 92 insertions, 0 deletions
"""Defines the BSIE class, which extracts triples from files.

Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
# imports
import typing

# bsie imports
from bsie.tools import Pipeline
from bsie.utils import bsfs, node, ns

# exports
__all__: typing.Sequence[str] = (
    'BSIE',
    )


## code ##

class BSIE():
    """Extract triples from files.

    Controls which predicates to extract (*collect*) and
    which to not extract (*discard*). Note that this only affects
    principal predicates, not auxiliary predicates like, e.g., tag labels.

    """

    # pipeline
    _pipeline: Pipeline

    # predicates to extract.
    _principals: typing.Set[bsfs.URI]

    # local schema.
    _schema: bsfs.schema.Schema

    def __init__(
            self,
            # pipeline builder.
            pipeline: Pipeline,
            # principals to extract at most. None implies all available w.r.t. extractors.
            collect: typing.Optional[typing.Iterable[bsfs.URI]] = None,
            # principals to discard.
            discard: typing.Optional[typing.Iterable[bsfs.URI]] = None,
            ):
        """Select the principals to extract and compile the matching schema.

        Starts from the URIs of all principals offered by *pipeline*, keeps
        only those listed in *collect* (if given and non-empty), and then
        removes those listed in *discard*. ``ns.bsfs.Predicate`` is always
        removed from the selection.
        """
        # store pipeline
        self._pipeline = pipeline
        # start off with available principals
        self._principals = {pred.uri for pred in self._pipeline.principals}
        # limit principals to the ones specified by argument.
        # NOTE: an empty *collect* is treated like None, i.e., no restriction.
        if collect is not None:
            collect = set(collect)
            if collect:
                self._principals &= collect
        # discard principals.
        if discard is not None:
            self._principals -= set(discard)
        # discard ns.bsfs.Predicate
        self._principals.discard(ns.bsfs.Predicate)
        # compile a schema that only contains the requested principals (and auxiliary predicates)
        self._schema = self._pipeline.subschema(
            self._pipeline.schema.predicate(pred) for pred in self._principals)

    @property
    def schema(self) -> bsfs.schema.Schema:
        """Return the BSIE schema."""
        return self._schema

    @property
    def principals(self) -> typing.Iterator[bsfs.URI]:
        """Return an iterator over the URIs of the selected principal predicates."""
        return iter(self._principals)

    def from_file(
            self,
            path: bsfs.URI,
            principals: typing.Optional[typing.Iterable[bsfs.URI]] = None,
            ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.URI, typing.Any]]:
        """Produce triples for a given *path*. Limit to *principals* if given.

        Requested *principals* that were not selected for this instance are
        ignored. If *principals* is None, all selected principals are used.

        This is a generator: the predicate lookup and the pipeline invocation
        only run once the returned iterator is consumed.
        """
        # filter through requested principals. The intersection is built as a
        # fresh set so that self._principals is never modified through an alias.
        if principals is None:
            selected = self._principals
        else:
            selected = set(principals) & self._principals
        # predicate lookup
        predicates = {self.schema.predicate(pred) for pred in selected}
        # invoke pipeline
        yield from self._pipeline(path, predicates)

## EOF ##