aboutsummaryrefslogtreecommitdiffstats
path: root/bsie
diff options
context:
space:
mode:
author: Matthias Baumgartner <dev@igsor.net> 2022-12-15 17:19:21 +0100
committer: Matthias Baumgartner <dev@igsor.net> 2022-12-15 17:19:21 +0100
commit: 37510d134458bf954ca2da6d40be0d6c76661e8c (patch)
tree: 41e1383485ccecf5f21ac035ffb96765e66d9e05 /bsie
parent: 3426b4e201cf03b78d2a3f144876955fcda2f66b (diff)
download: bsie-37510d134458bf954ca2da6d40be0d6c76661e8c.tar.gz
bsie-37510d134458bf954ca2da6d40be0d6c76661e8c.tar.bz2
bsie-37510d134458bf954ca2da6d40be0d6c76661e8c.zip
bsie/pipeline interface revision:
* predicates -> principals
* schema as property
* principals as property
* information hiding
* full subschema instead of only predicates
Diffstat (limited to 'bsie')
-rw-r--r-- bsie/lib/bsie.py 61
-rw-r--r-- bsie/tools/pipeline.py 52
2 files changed, 70 insertions, 43 deletions
diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py
index 3aeee2b..e087fa9 100644
--- a/bsie/lib/bsie.py
+++ b/bsie/lib/bsie.py
@@ -8,8 +8,6 @@ Author: Matthias Baumgartner, 2022
import typing
# bsie imports
-from bsie.tools.pipeline import Pipeline
-from bsie.utils.bsfs import URI, schema as schema_
from bsie.tools import Pipeline
from bsie.utils import bsfs, node, ns
@@ -30,11 +28,14 @@ class BSIE():
"""
+ # pipeline
+ _pipeline: Pipeline
+
# predicates to extract.
- predicates: typing.Set[URI]
+ _principals: typing.Set[bsfs.URI]
# local schema.
- schema: schema_.Schema
+ _schema: bsfs.schema.Schema
def __init__(
self,
@@ -46,36 +47,46 @@ class BSIE():
discard: typing.Optional[typing.Iterable[bsfs.URI]] = None,
):
# store pipeline
- self.pipeline = pipeline
- # start off with available predicates
- self.predicates = {pred.uri for pred in self.pipeline.predicates()}
- # limit predicates to specified ones by argument.
+ self._pipeline = pipeline
+ # start off with available principals
+ self._principals = {pred.uri for pred in self._pipeline.principals}
+ # limit principals to specified ones by argument.
if collect is not None:
collect = set(collect)
if len(collect) > 0:
- self.predicates &= collect
- # discard predicates.
+ self._principals &= collect
+ # discard principals.
if discard is not None:
- self.predicates -= set(discard)
+ self._principals -= set(discard)
# discard ns.bsfs.Predicate
- self.predicates.discard(ns.bsfs.Predicate)
- # compile a schema that only contains the requested predicates (and implied types)
- self.schema = schema_.Schema({
- self.pipeline.schema.predicate(pred) for pred in self.predicates})
+ self._principals.discard(ns.bsfs.Predicate)
+ # compile a schema that only contains the requested principals (and auxiliary predicates)
+ self._schema = self._pipeline.subschema(
+ self._pipeline.schema.predicate(pred) for pred in self._principals)
+
+ @property
+ def schema(self) -> bsfs.schema.Schema:
+ """Return the BSIE schema."""
+ return self._schema
+
+ @property
+ def principals(self) -> typing.Iterator[bsfs.URI]:
+ """Return an iterator to the principal predicates."""
+ return iter(self._principals)
def from_file(
self,
- path: URI,
- predicates: typing.Optional[typing.Iterable[URI]] = None,
- ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]:
- """Produce triples for a given *path*. Limit to *predicates* if given."""
- # get requested predicates.
- predicates = set(predicates) if predicates is not None else self.predicates
- # filter through requested predicates.
- predicates &= self.predicates
+ path: bsfs.URI,
+ principals: typing.Optional[typing.Iterable[bsfs.URI]] = None,
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.URI, typing.Any]]:
+ """Produce triples for a given *path*. Limit to *principals* if given."""
+ # get requested principals.
+ principals = set(principals) if principals is not None else self._principals
+ # filter through requested principals.
+ principals &= self._principals
# predicate lookup
- predicates = {self.schema.predicate(pred) for pred in predicates}
+ principals = {self.schema.predicate(pred) for pred in principals}
# invoke pipeline
- yield from self.pipeline(path, predicates)
+ yield from self._pipeline(path, principals)
## EOF ##
diff --git a/bsie/tools/pipeline.py b/bsie/tools/pipeline.py
index 834bd99..52ce526 100644
--- a/bsie/tools/pipeline.py
+++ b/bsie/tools/pipeline.py
@@ -11,8 +11,6 @@ import typing
# bsie imports
from bsie import base
-from bsie.utils.node import Node
-from bsie.utils.bsfs import schema as _schema, URI, uuid as _uuid, typename
from bsie.utils import bsfs, node, ns
# exports
@@ -36,7 +34,7 @@ class Pipeline():
"""
# combined extractor schemas.
- schema: _schema.Schema
+ _schema: bsfs.schema.Schema
# node prefix.
_prefix: bsfs.Namespace
@@ -53,7 +51,7 @@ class Pipeline():
self._prefix = prefix
self._ext2rdr = ext2rdr
# compile schema from all extractors
- self.schema = _schema.Schema.Union(ext.schema for ext in ext2rdr)
+ self._schema = bsfs.schema.Schema.Union(ext.schema for ext in ext2rdr)
def __str__(self) -> str:
return bsfs.typename(self)
@@ -62,29 +60,47 @@ class Pipeline():
return f'{bsfs.typename(self)}(...)'
def __hash__(self) -> int:
- return hash((type(self), self._prefix, self.schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values())))
+ return hash((type(self), self._prefix, self._schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values())))
def __eq__(self, other: typing.Any) -> bool:
return isinstance(other, type(self)) \
- and self.schema == other.schema \
+ and self._schema == other._schema \
and self._prefix == other._prefix \
and self._ext2rdr == other._ext2rdr
- def predicates(self) -> typing.Iterator[_schema.Predicate]:
- """Return the predicates that are extracted from a file."""
- return iter({pred for ext in self._ext2rdr for pred in ext.predicates()})
+ @property
+ def schema(self) -> bsfs.schema.Schema:
+ """Return the pipeline's schema (combined from all extractors)."""
+ return self._schema
+
+ @property
+ def principals(self) -> typing.Iterator[bsfs.schema.Predicate]:
+ """Return the principal predicates that can be extracted."""
+ return iter({pred for ext in self._ext2rdr for pred in ext.principals})
+
+ def subschema(self, principals: typing.Iterable[bsfs.schema.Predicate]) -> bsfs.schema.Schema:
+ """Return the subset of the schema that supports the given *principals*."""
+ # materialize principals
+ principals = set(principals)
+ # collect and combine schemas from extractors
+ return bsfs.schema.Schema.Union({
+ ext.schema
+ for ext
+ in self._ext2rdr
+ if not set(ext.principals).isdisjoint(principals)
+ })
def __call__(
self,
- path: URI,
- predicates: typing.Optional[typing.Iterable[_schema.Predicate]] = None,
- ) -> typing.Iterator[typing.Tuple[Node, _schema.Predicate, typing.Any]]:
- """Extract triples from the file at *path*. Optionally, limit triples to *predicates*."""
- # get predicates
- predicates = set(predicates) if predicates is not None else set(self.schema.predicates())
+ path: bsfs.URI,
+ principals: typing.Optional[typing.Iterable[bsfs.schema.Predicate]] = None,
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+ """Extract triples from the file at *path*. Optionally, limit triples to *principals*."""
+ # get principals
+ principals = set(principals) if principals is not None else set(self.schema.predicates())
# get extractors
- extractors = {ext for ext in self._ext2rdr if not set(ext.predicates()).isdisjoint(predicates)}
+ extractors = {ext for ext in self._ext2rdr if not set(ext.principals).isdisjoint(principals)}
# corner-case short-cut
if len(extractors) == 0:
@@ -110,8 +126,8 @@ class Pipeline():
for ext in extrs:
try:
# get predicate/value tuples
- for node, pred, value in ext.extract(subject, content, predicates):
- yield node, pred, value
+ for subject, pred, value in ext.extract(subject, content, principals):
+ yield subject, pred, value
except base.errors.ExtractorError as err:
# critical extractor failure.