""" Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ # standard imports import typing # bsie imports from bsie.utils import bsfs, node, ns # inner-module imports from .pipeline import Pipeline # exports __all__: typing.Sequence[str] = ( 'BSIE', ) ## code ## class BSIE(): """Extract triples from files. Controls which predicates to extract (*collect*) and which to not extract (*discard*). Note that this only affects principal predicates not auxililary predicates like, e.g., tag labels. """ # pipeline _pipeline: Pipeline # predicates to extract. _principals: typing.Set[bsfs.URI] # local schema. _schema: bsfs.schema.Schema def __init__( self, # pipeline builder. pipeline: Pipeline, # principals to extract at most. None implies all available w.r.t. extractors. collect: typing.Optional[typing.Iterable[bsfs.URI]] = None, # principals to discard. discard: typing.Optional[typing.Iterable[bsfs.URI]] = None, ): # store pipeline self._pipeline = pipeline # start off with available principals self._principals = {pred.uri for pred in self._pipeline.principals} # limit principals to specified ones by argument. if collect is not None: collect = set(collect) if len(collect) > 0: self._principals &= collect # discard principals. if discard is not None: self._principals -= set(discard) # discard ns.bsfs.Predicate self._principals.discard(ns.bsfs.Predicate) # compile a schema that only contains the requested principals (and auxiliary predicates) self._schema = self._pipeline.subschema( self._pipeline.schema.predicate(pred) for pred in self._principals) @property def schema(self) -> bsfs.schema.Schema: """Return the BSIE schema.""" return self._schema @property def principals(self) -> typing.Iterator[bsfs.URI]: """Return an iterator to the principal predicates.""" return iter(self._principals) def from_file( self, path: bsfs.URI, principals: typing.Optional[typing.Iterable[bsfs.URI]] = None, ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.URI, typing.Any]]: """Produce triples for a given *path*. Limit to *principals* if given.""" # get requested principals. principals = set(principals) if principals is not None else self._principals # filter through requested principals. principals &= self._principals # predicate lookup principals = {self.schema.predicate(pred) for pred in principals} # invoke pipeline yield from self._pipeline(path, principals) ## EOF ##