""" Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ # imports import typing # bsie imports from bsie.tools.pipeline import Pipeline from bsie.utils.bsfs import URI, schema as schema_ from bsie.tools import Pipeline from bsie.utils import bsfs, node, ns # exports __all__: typing.Sequence[str] = ( 'BSIE', ) ## code ## class BSIE(): """Extract triples from files. Controls which predicates to extract (*collect*) and which to not extract (*discard*). Note that this only affects principal predicates not auxililary predicates like, e.g., tag labels. """ # predicates to extract. predicates: typing.Set[URI] # local schema. schema: schema_.Schema def __init__( self, # pipeline builder. pipeline: Pipeline, # principals to extract at most. None implies all available w.r.t. extractors. collect: typing.Optional[typing.Iterable[bsfs.URI]] = None, # principals to discard. discard: typing.Optional[typing.Iterable[bsfs.URI]] = None, ): # store pipeline self.pipeline = pipeline # start off with available predicates self.predicates = {pred.uri for pred in self.pipeline.predicates()} # limit predicates to specified ones by argument. if collect is not None: collect = set(collect) if len(collect) > 0: self.predicates &= collect # discard predicates. if discard is not None: self.predicates -= set(discard) # discard ns.bsfs.Predicate self.predicates.discard(ns.bsfs.Predicate) # compile a schema that only contains the requested predicates (and implied types) self.schema = schema_.Schema({ self.pipeline.schema.predicate(pred) for pred in self.predicates}) def from_file( self, path: URI, predicates: typing.Optional[typing.Iterable[URI]] = None, ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]: """Produce triples for a given *path*. Limit to *predicates* if given.""" # get requested predicates. predicates = set(predicates) if predicates is not None else self.predicates # filter through requested predicates. predicates &= self.predicates # predicate lookup predicates = {self.schema.predicate(pred) for pred in predicates} # invoke pipeline yield from self.pipeline(path, predicates) ## EOF ##