From 37510d134458bf954ca2da6d40be0d6c76661e8c Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Thu, 15 Dec 2022 17:19:21 +0100 Subject: bsie/pipeline interface revision: * predicates -> principals * schema as property * principals as property * information hiding * full subschema instead of only predicates --- bsie/lib/bsie.py | 61 ++++++++++++++++++++++--------------- bsie/tools/pipeline.py | 52 ++++++++++++++++++++----------- test/lib/test_bsie.py | 74 +++++++-------------------------------------- test/tools/test_pipeline.py | 5 ++- 4 files changed, 83 insertions(+), 109 deletions(-) diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py index 3aeee2b..e087fa9 100644 --- a/bsie/lib/bsie.py +++ b/bsie/lib/bsie.py @@ -8,8 +8,6 @@ Author: Matthias Baumgartner, 2022 import typing # bsie imports -from bsie.tools.pipeline import Pipeline -from bsie.utils.bsfs import URI, schema as schema_ from bsie.tools import Pipeline from bsie.utils import bsfs, node, ns @@ -30,11 +28,14 @@ class BSIE(): """ + # pipeline + _pipeline: Pipeline + # predicates to extract. - predicates: typing.Set[URI] + _principals: typing.Set[bsfs.URI] # local schema. - schema: schema_.Schema + _schema: bsfs.schema.Schema def __init__( self, @@ -46,36 +47,46 @@ class BSIE(): discard: typing.Optional[typing.Iterable[bsfs.URI]] = None, ): # store pipeline - self.pipeline = pipeline - # start off with available predicates - self.predicates = {pred.uri for pred in self.pipeline.predicates()} - # limit predicates to specified ones by argument. + self._pipeline = pipeline + # start off with available principals + self._principals = {pred.uri for pred in self._pipeline.principals} + # limit principals to specified ones by argument. if collect is not None: collect = set(collect) if len(collect) > 0: - self.predicates &= collect - # discard predicates. + self._principals &= collect + # discard principals. if discard is not None: - self.predicates -= set(discard) + self._principals -= set(discard) # discard ns.bsfs.Predicate - self.predicates.discard(ns.bsfs.Predicate) - # compile a schema that only contains the requested predicates (and implied types) - self.schema = schema_.Schema({ - self.pipeline.schema.predicate(pred) for pred in self.predicates}) + self._principals.discard(ns.bsfs.Predicate) + # compile a schema that only contains the requested principals (and auxiliary predicates) + self._schema = self._pipeline.subschema( + self._pipeline.schema.predicate(pred) for pred in self._principals) + + @property + def schema(self) -> bsfs.schema.Schema: + """Return the BSIE schema.""" + return self._schema + + @property + def principals(self) -> typing.Iterator[bsfs.URI]: + """Return an iterator to the principal predicates.""" + return iter(self._principals) def from_file( self, - path: URI, - predicates: typing.Optional[typing.Iterable[URI]] = None, - ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]: - """Produce triples for a given *path*. Limit to *predicates* if given.""" - # get requested predicates. - predicates = set(predicates) if predicates is not None else self.predicates - # filter through requested predicates. - predicates &= self.predicates + path: bsfs.URI, + principals: typing.Optional[typing.Iterable[bsfs.URI]] = None, + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.URI, typing.Any]]: + """Produce triples for a given *path*. Limit to *principals* if given.""" + # get requested principals. + principals = set(principals) if principals is not None else self._principals + # filter through requested principals. + principals &= self._principals # predicate lookup - predicates = {self.schema.predicate(pred) for pred in predicates} + principals = {self.schema.predicate(pred) for pred in principals} # invoke pipeline - yield from self.pipeline(path, predicates) + yield from self._pipeline(path, principals) ## EOF ## diff --git a/bsie/tools/pipeline.py b/bsie/tools/pipeline.py index 834bd99..52ce526 100644 --- a/bsie/tools/pipeline.py +++ b/bsie/tools/pipeline.py @@ -11,8 +11,6 @@ import typing # bsie imports from bsie import base -from bsie.utils.node import Node -from bsie.utils.bsfs import schema as _schema, URI, uuid as _uuid, typename from bsie.utils import bsfs, node, ns # exports @@ -36,7 +34,7 @@ class Pipeline(): """ # combined extractor schemas. - schema: _schema.Schema + _schema: bsfs.schema.Schema # node prefix. _prefix: bsfs.Namespace @@ -53,7 +51,7 @@ class Pipeline(): self._prefix = prefix self._ext2rdr = ext2rdr # compile schema from all extractors - self.schema = _schema.Schema.Union(ext.schema for ext in ext2rdr) + self._schema = bsfs.schema.Schema.Union(ext.schema for ext in ext2rdr) def __str__(self) -> str: return bsfs.typename(self) @@ -62,29 +60,47 @@ class Pipeline(): return f'{bsfs.typename(self)}(...)' def __hash__(self) -> int: - return hash((type(self), self._prefix, self.schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values()))) + return hash((type(self), self._prefix, self._schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values()))) def __eq__(self, other: typing.Any) -> bool: return isinstance(other, type(self)) \ - and self.schema == other.schema \ + and self._schema == other._schema \ and self._prefix == other._prefix \ and self._ext2rdr == other._ext2rdr - def predicates(self) -> typing.Iterator[_schema.Predicate]: - """Return the predicates that are extracted from a file.""" - return iter({pred for ext in self._ext2rdr for pred in ext.predicates()}) + @property + def schema(self) -> bsfs.schema.Schema: + """Return the pipeline's schema (combined from all extractors).""" + return self._schema + + @property + def principals(self) -> typing.Iterator[bsfs.schema.Predicate]: + """Return the principal predicates that can be extracted.""" + return iter({pred for ext in self._ext2rdr for pred in ext.principals}) + + def subschema(self, principals: typing.Iterable[bsfs.schema.Predicate]) -> bsfs.schema.Schema: + """Return the subset of the schema that supports the given *principals*.""" + # materialize principals + principals = set(principals) + # collect and combine schemas from extractors + return bsfs.schema.Schema.Union({ + ext.schema + for ext + in self._ext2rdr + if not set(ext.principals).isdisjoint(principals) + }) def __call__( self, - path: URI, - predicates: typing.Optional[typing.Iterable[_schema.Predicate]] = None, - ) -> typing.Iterator[typing.Tuple[Node, _schema.Predicate, typing.Any]]: - """Extract triples from the file at *path*. Optionally, limit triples to *predicates*.""" - # get predicates - predicates = set(predicates) if predicates is not None else set(self.schema.predicates()) + path: bsfs.URI, + principals: typing.Optional[typing.Iterable[bsfs.schema.Predicate]] = None, + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + """Extract triples from the file at *path*. Optionally, limit triples to *principals*.""" + # get principals + principals = set(principals) if principals is not None else set(self.schema.predicates()) # get extractors - extractors = {ext for ext in self._ext2rdr if not set(ext.predicates()).isdisjoint(predicates)} + extractors = {ext for ext in self._ext2rdr if not set(ext.principals).isdisjoint(principals)} # corner-case short-cut if len(extractors) == 0: @@ -110,8 +126,8 @@ class Pipeline(): for ext in extrs: try: # get predicate/value tuples - for node, pred, value in ext.extract(subject, content, predicates): - yield node, pred, value + for subject, pred, value in ext.extract(subject, content, principals): + yield subject, pred, value except base.errors.ExtractorError as err: # critical extractor failure. diff --git a/test/lib/test_bsie.py b/test/lib/test_bsie.py index 43e7b1d..f3f476e 100644 --- a/test/lib/test_bsie.py +++ b/test/lib/test_bsie.py @@ -11,8 +11,6 @@ import unittest # bsie imports from bsie.base import extractor from bsie.tools import builder -from bsie.utils.bsfs import URI, schema -from bsie.utils.node import Node from bsie.utils import bsfs, node, ns # objects to test @@ -47,22 +45,12 @@ class TestBSIE(unittest.TestCase): def test_construction(self): # pipeline only lib = BSIE(self.pipeline) - self.assertSetEqual(lib.predicates, { + self.assertSetEqual(set(lib.principals), { ns.bse.filename, ns.bse.filesize, ns.bse.author, }) - self.assertEqual(lib.schema, schema.Schema.from_string(''' - prefix rdfs: - prefix xsd: - prefix bsfs: - prefix bse: - # essential nodes - bsfs:Entity rdfs:subClassOf bsfs:Node . - # common definitions - xsd:string rdfs:subClassOf bsfs:Literal . - xsd:integer rdfs:subClassOf bsfs:Literal . - + self.assertEqual(lib.schema, bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' bse:filename rdfs:subClassOf bsfs:Predicate ; rdfs:domain bsfs:File ; rdfs:range xsd:string ; @@ -85,21 +73,11 @@ class TestBSIE(unittest.TestCase): ns.bse.author, ns.bse.inexistent, }) - self.assertSetEqual(lib.predicates, { + self.assertSetEqual(set(lib.principals), { ns.bse.filesize, ns.bse.author, }) - self.assertEqual(lib.schema, schema.Schema.from_string(''' - prefix rdfs: - prefix xsd: - prefix bsfs: - prefix bse: - # essential nodes - bsfs:Entity rdfs:subClassOf bsfs:Node . - # common definitions - xsd:string rdfs:subClassOf bsfs:Literal . - xsd:integer rdfs:subClassOf bsfs:Literal . - + self.assertEqual(lib.schema, bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' bse:filesize rdfs:subClassOf bsfs:Predicate ; rdfs:domain bsfs:File ; rdfs:range xsd:integer; @@ -109,26 +87,15 @@ class TestBSIE(unittest.TestCase): rdfs:domain bsfs:Entity ; rdfs:range xsd:string ; bsfs:unique "true"^^xsd:boolean . - ''')) # empty collect is disregarded lib = BSIE(self.pipeline, collect={}) - self.assertSetEqual(lib.predicates, { + self.assertSetEqual(set(lib.principals), { ns.bse.filename, ns.bse.filesize, ns.bse.author, }) - self.assertEqual(lib.schema, schema.Schema.from_string(''' - prefix rdfs: - prefix xsd: - prefix bsfs: - prefix bse: - # essential nodes - bsfs:Entity rdfs:subClassOf bsfs:Node . - # common definitions - xsd:string rdfs:subClassOf bsfs:Literal . - xsd:integer rdfs:subClassOf bsfs:Literal . - + self.assertEqual(lib.schema, bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' bse:filename rdfs:subClassOf bsfs:Predicate ; rdfs:domain bsfs:File ; rdfs:range xsd:string ; @@ -152,24 +119,14 @@ class TestBSIE(unittest.TestCase): ns.bse.filename, ns.bse.inexistent, }) - self.assertSetEqual(lib.predicates, { + self.assertSetEqual(set(lib.principals), { ns.bse.author, }) - self.assertEqual(lib.schema, schema.Schema.from_string(''' - prefix rdfs: - prefix xsd: - prefix bsfs: - prefix bse: - # essential nodes - bsfs:Entity rdfs:subClassOf bsfs:Node . - # common definitions - xsd:string rdfs:subClassOf bsfs:Literal . - + self.assertEqual(lib.schema, bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' bse:author rdfs:subClassOf bsfs:Predicate ; rdfs:domain bsfs:Entity ; rdfs:range xsd:string ; bsfs:unique "true"^^xsd:boolean . - ''')) # specify collect and discard @@ -177,19 +134,10 @@ class TestBSIE(unittest.TestCase): collect={ns.bse.filesize, ns.bse.author, ns.bse.foo, ns.bse.bar}, discard={ns.bse.author, ns.bse.foo, ns.bse.foobar}, ) - self.assertSetEqual(lib.predicates, { + self.assertSetEqual(set(lib.principals), { ns.bse.filesize, }) - self.assertEqual(lib.schema, schema.Schema.from_string(''' - prefix rdfs: - prefix xsd: - prefix bsfs: - prefix bse: - # essential nodes - bsfs:Entity rdfs:subClassOf bsfs:Node . - # common definitions - xsd:integer rdfs:subClassOf bsfs:Literal . - + self.assertEqual(lib.schema, bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' bse:filesize rdfs:subClassOf bsfs:Predicate ; rdfs:domain bsfs:File ; rdfs:range xsd:integer; @@ -201,7 +149,7 @@ class TestBSIE(unittest.TestCase): def test_from_file(self): # setup lib = BSIE(self.pipeline) - self.assertSetEqual(set(lib.predicates), { + self.assertSetEqual(set(lib.principals), { ns.bse.filesize, ns.bse.filename, ns.bse.author, diff --git a/test/tools/test_pipeline.py b/test/tools/test_pipeline.py index e440ab5..91bf736 100644 --- a/test/tools/test_pipeline.py +++ b/test/tools/test_pipeline.py @@ -75,7 +75,7 @@ class TestPipeline(unittest.TestCase): # equivalence respects schema p2 = Pipeline(self.prefix, self.ext2rdr) - p2.schema = pipeline.schema.Empty() + p2._schema = pipeline.schema.Empty() self.assertNotEqual(pipeline, p2) self.assertNotEqual(hash(pipeline), hash(p2)) @@ -160,8 +160,7 @@ class TestPipeline(unittest.TestCase): # build pipeline pipeline = Pipeline(self.prefix, self.ext2rdr) # - self.assertSetEqual(set(pipeline.predicates()), { - pipeline.schema.predicate(ns.bsfs.Predicate), + self.assertSetEqual(set(pipeline.principals), { pipeline.schema.predicate(ns.bse.filename), pipeline.schema.predicate(ns.bse.filesize), pipeline.schema.predicate(ns.bse.author), -- cgit v1.2.3