From 559e643bb1fa39feefd2eb73847ad9420daf1deb Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Wed, 14 Dec 2022 06:10:25 +0100 Subject: bsie extraction and info apps --- bsie/lib/__init__.py | 13 +++++++++ bsie/lib/bsie.py | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 bsie/lib/__init__.py create mode 100644 bsie/lib/bsie.py (limited to 'bsie/lib') diff --git a/bsie/lib/__init__.py b/bsie/lib/__init__.py new file mode 100644 index 0000000..f6c9018 --- /dev/null +++ b/bsie/lib/__init__.py @@ -0,0 +1,13 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# exports +__all__: typing.Sequence[str] = [] + +## EOF ## diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py new file mode 100644 index 0000000..aeccc8c --- /dev/null +++ b/bsie/lib/bsie.py @@ -0,0 +1,80 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# bsie imports +from bsie.tools.pipeline import Pipeline +from bsie.utils import node, ns +from bsie.utils.bsfs import URI, schema as schema_ + +# exports +__all__: typing.Sequence[str] = ( + 'BSIE', + ) + + +## code ## + +class BSIE(): + """Extract triples from files. + + Controls which predicates to extract (*collect*) and + which to not extract (*discard*). Note that this only affects + principal predicates not auxiliary predicates like, e.g., tag labels. + + """ + + # predicates to extract. + predicates: typing.Set[URI] + + # local schema. + schema: schema_.Schema + + def __init__( + self, + # pipeline builder. + pipeline: Pipeline, + # predicates to extract at most. None implies all available w.r.t. extractors. + collect: typing.Optional[typing.Iterable[URI]] = None, + # predicates to discard.
+ discard: typing.Optional[typing.Iterable[URI]] = None, + ): + # store pipeline + self.pipeline = pipeline + # start off with available predicates + self.predicates = {pred.uri for pred in self.pipeline.predicates()} + # limit predicates to specified ones by argument. + if collect is not None: + collect = set(collect) + if len(collect) > 0: + self.predicates &= collect + # discard predicates. + if discard is not None: + self.predicates -= set(discard) + # discard ns.bsfs.Predicate + self.predicates.discard(ns.bsfs.Predicate) + # compile a schema that only contains the requested predicates (and implied types) + self.schema = schema_.Schema({ + self.pipeline.schema.predicate(pred) for pred in self.predicates}) + + def from_file( + self, + path: URI, + predicates: typing.Optional[typing.Iterable[URI]] = None, + ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]: + """Produce triples for a given *path*. Limit to *predicates* if given.""" + # get requested predicates. + predicates = set(predicates) if predicates is not None else self.predicates + # filter through requested predicates. 
+ predicates &= self.predicates + # predicate lookup + predicates = {self.schema.predicate(pred) for pred in predicates} + # invoke pipeline + yield from self.pipeline(path, predicates) + +## EOF ## -- cgit v1.2.3 From 49cf03fc212c813862453de5352436dc90d1e458 Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Thu, 15 Dec 2022 16:50:53 +0100 Subject: imports and init files --- bsie/lib/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'bsie/lib') diff --git a/bsie/lib/__init__.py b/bsie/lib/__init__.py index f6c9018..578c2c4 100644 --- a/bsie/lib/__init__.py +++ b/bsie/lib/__init__.py @@ -7,7 +7,12 @@ Author: Matthias Baumgartner, 2022 # imports import typing +# inner-module imports +from .bsie import BSIE + # exports -__all__: typing.Sequence[str] = [] +__all__: typing.Sequence[str] = ( + 'BSIE', + ) ## EOF ## -- cgit v1.2.3 From 3b7fee369924eb7704709edeb8c17fff9c020dfb Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Thu, 15 Dec 2022 17:06:09 +0100 Subject: import fixes --- bsie/lib/bsie.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'bsie/lib') diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py index aeccc8c..3aeee2b 100644 --- a/bsie/lib/bsie.py +++ b/bsie/lib/bsie.py @@ -9,8 +9,9 @@ import typing # bsie imports from bsie.tools.pipeline import Pipeline -from bsie.utils import node, ns from bsie.utils.bsfs import URI, schema as schema_ +from bsie.tools import Pipeline +from bsie.utils import bsfs, node, ns # exports __all__: typing.Sequence[str] = ( @@ -39,10 +40,10 @@ class BSIE(): self, # pipeline builder. pipeline: Pipeline, - # predicates to extract at most. None implies all available w.r.t. extractors. - collect: typing.Optional[typing.Iterable[URI]] = None, - # predicates to discard. - discard: typing.Optional[typing.Iterable[URI]] = None, + # principals to extract at most. None implies all available w.r.t. extractors. 
+ collect: typing.Optional[typing.Iterable[bsfs.URI]] = None, + # principals to discard. + discard: typing.Optional[typing.Iterable[bsfs.URI]] = None, ): # store pipeline self.pipeline = pipeline -- cgit v1.2.3 From 37510d134458bf954ca2da6d40be0d6c76661e8c Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Thu, 15 Dec 2022 17:19:21 +0100 Subject: bsie/pipeline interface revision: * predicates -> principals * schema as property * principals as property * information hiding * full subschema instead of only predicates --- bsie/lib/bsie.py | 61 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 25 deletions(-) (limited to 'bsie/lib') diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py index 3aeee2b..e087fa9 100644 --- a/bsie/lib/bsie.py +++ b/bsie/lib/bsie.py @@ -8,8 +8,6 @@ Author: Matthias Baumgartner, 2022 import typing # bsie imports -from bsie.tools.pipeline import Pipeline -from bsie.utils.bsfs import URI, schema as schema_ from bsie.tools import Pipeline from bsie.utils import bsfs, node, ns @@ -30,11 +28,14 @@ class BSIE(): """ + # pipeline + _pipeline: Pipeline + # predicates to extract. - predicates: typing.Set[URI] + _principals: typing.Set[bsfs.URI] # local schema. - schema: schema_.Schema + _schema: bsfs.schema.Schema def __init__( self, @@ -46,36 +47,46 @@ class BSIE(): discard: typing.Optional[typing.Iterable[bsfs.URI]] = None, ): # store pipeline - self.pipeline = pipeline - # start off with available predicates - self.predicates = {pred.uri for pred in self.pipeline.predicates()} - # limit predicates to specified ones by argument. + self._pipeline = pipeline + # start off with available principals + self._principals = {pred.uri for pred in self._pipeline.principals} + # limit principals to specified ones by argument. if collect is not None: collect = set(collect) if len(collect) > 0: - self.predicates &= collect - # discard predicates. + self._principals &= collect + # discard principals. 
if discard is not None: - self.predicates -= set(discard) + self._principals -= set(discard) # discard ns.bsfs.Predicate - self.predicates.discard(ns.bsfs.Predicate) - # compile a schema that only contains the requested predicates (and implied types) - self.schema = schema_.Schema({ - self.pipeline.schema.predicate(pred) for pred in self.predicates}) + self._principals.discard(ns.bsfs.Predicate) + # compile a schema that only contains the requested principals (and auxiliary predicates) + self._schema = self._pipeline.subschema( + self._pipeline.schema.predicate(pred) for pred in self._principals) + + @property + def schema(self) -> bsfs.schema.Schema: + """Return the BSIE schema.""" + return self._schema + + @property + def principals(self) -> typing.Iterator[bsfs.URI]: + """Return an iterator to the principal predicates.""" + return iter(self._principals) def from_file( self, - path: URI, - predicates: typing.Optional[typing.Iterable[URI]] = None, - ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]: - """Produce triples for a given *path*. Limit to *predicates* if given.""" - # get requested predicates. - predicates = set(predicates) if predicates is not None else self.predicates - # filter through requested predicates. - predicates &= self.predicates + path: bsfs.URI, + principals: typing.Optional[typing.Iterable[bsfs.URI]] = None, + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.URI, typing.Any]]: + """Produce triples for a given *path*. Limit to *principals* if given.""" + # get requested principals. + principals = set(principals) if principals is not None else self._principals + # filter through requested principals. + principals &= self._principals # predicate lookup - predicates = {self.schema.predicate(pred) for pred in predicates} + principals = {self.schema.predicate(pred) for pred in principals} # invoke pipeline - yield from self.pipeline(path, predicates) + yield from self._pipeline(path, principals) ## EOF ## -- cgit v1.2.3