about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- bsie/lib/bsie.py 61
-rw-r--r-- bsie/tools/pipeline.py 52
-rw-r--r-- test/lib/test_bsie.py 74
-rw-r--r-- test/tools/test_pipeline.py 5
4 files changed, 83 insertions, 109 deletions
diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py
index 3aeee2b..e087fa9 100644
--- a/bsie/lib/bsie.py
+++ b/bsie/lib/bsie.py
@@ -8,8 +8,6 @@ Author: Matthias Baumgartner, 2022
import typing
# bsie imports
-from bsie.tools.pipeline import Pipeline
-from bsie.utils.bsfs import URI, schema as schema_
from bsie.tools import Pipeline
from bsie.utils import bsfs, node, ns
@@ -30,11 +28,14 @@ class BSIE():
"""
+ # pipeline
+ _pipeline: Pipeline
+
# predicates to extract.
- predicates: typing.Set[URI]
+ _principals: typing.Set[bsfs.URI]
# local schema.
- schema: schema_.Schema
+ _schema: bsfs.schema.Schema
def __init__(
self,
@@ -46,36 +47,46 @@ class BSIE():
discard: typing.Optional[typing.Iterable[bsfs.URI]] = None,
):
# store pipeline
- self.pipeline = pipeline
- # start off with available predicates
- self.predicates = {pred.uri for pred in self.pipeline.predicates()}
- # limit predicates to specified ones by argument.
+ self._pipeline = pipeline
+ # start off with available principals
+ self._principals = {pred.uri for pred in self._pipeline.principals}
+ # limit principals to specified ones by argument.
if collect is not None:
collect = set(collect)
if len(collect) > 0:
- self.predicates &= collect
- # discard predicates.
+ self._principals &= collect
+ # discard principals.
if discard is not None:
- self.predicates -= set(discard)
+ self._principals -= set(discard)
# discard ns.bsfs.Predicate
- self.predicates.discard(ns.bsfs.Predicate)
- # compile a schema that only contains the requested predicates (and implied types)
- self.schema = schema_.Schema({
- self.pipeline.schema.predicate(pred) for pred in self.predicates})
+ self._principals.discard(ns.bsfs.Predicate)
+ # compile a schema that only contains the requested principals (and auxiliary predicates)
+ self._schema = self._pipeline.subschema(
+ self._pipeline.schema.predicate(pred) for pred in self._principals)
+
+ @property
+ def schema(self) -> bsfs.schema.Schema:
+ """Return the BSIE schema."""
+ return self._schema
+
+ @property
+ def principals(self) -> typing.Iterator[bsfs.URI]:
+ """Return an iterator to the principal predicates."""
+ return iter(self._principals)
def from_file(
self,
- path: URI,
- predicates: typing.Optional[typing.Iterable[URI]] = None,
- ) -> typing.Iterator[typing.Tuple[node.Node, URI, typing.Any]]:
- """Produce triples for a given *path*. Limit to *predicates* if given."""
- # get requested predicates.
- predicates = set(predicates) if predicates is not None else self.predicates
- # filter through requested predicates.
- predicates &= self.predicates
+ path: bsfs.URI,
+ principals: typing.Optional[typing.Iterable[bsfs.URI]] = None,
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.URI, typing.Any]]:
+ """Produce triples for a given *path*. Limit to *principals* if given."""
+ # get requested principals.
+ principals = set(principals) if principals is not None else self._principals
+ # filter through requested principals.
+ principals &= self._principals
# predicate lookup
- predicates = {self.schema.predicate(pred) for pred in predicates}
+ principals = {self.schema.predicate(pred) for pred in principals}
# invoke pipeline
- yield from self.pipeline(path, predicates)
+ yield from self._pipeline(path, principals)
## EOF ##
diff --git a/bsie/tools/pipeline.py b/bsie/tools/pipeline.py
index 834bd99..52ce526 100644
--- a/bsie/tools/pipeline.py
+++ b/bsie/tools/pipeline.py
@@ -11,8 +11,6 @@ import typing
# bsie imports
from bsie import base
-from bsie.utils.node import Node
-from bsie.utils.bsfs import schema as _schema, URI, uuid as _uuid, typename
from bsie.utils import bsfs, node, ns
# exports
@@ -36,7 +34,7 @@ class Pipeline():
"""
# combined extractor schemas.
- schema: _schema.Schema
+ _schema: bsfs.schema.Schema
# node prefix.
_prefix: bsfs.Namespace
@@ -53,7 +51,7 @@ class Pipeline():
self._prefix = prefix
self._ext2rdr = ext2rdr
# compile schema from all extractors
- self.schema = _schema.Schema.Union(ext.schema for ext in ext2rdr)
+ self._schema = bsfs.schema.Schema.Union(ext.schema for ext in ext2rdr)
def __str__(self) -> str:
return bsfs.typename(self)
@@ -62,29 +60,47 @@ class Pipeline():
return f'{bsfs.typename(self)}(...)'
def __hash__(self) -> int:
- return hash((type(self), self._prefix, self.schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values())))
+ return hash((type(self), self._prefix, self._schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values())))
def __eq__(self, other: typing.Any) -> bool:
return isinstance(other, type(self)) \
- and self.schema == other.schema \
+ and self._schema == other._schema \
and self._prefix == other._prefix \
and self._ext2rdr == other._ext2rdr
- def predicates(self) -> typing.Iterator[_schema.Predicate]:
- """Return the predicates that are extracted from a file."""
- return iter({pred for ext in self._ext2rdr for pred in ext.predicates()})
+ @property
+ def schema(self) -> bsfs.schema.Schema:
+ """Return the pipeline's schema (combined from all extractors)."""
+ return self._schema
+
+ @property
+ def principals(self) -> typing.Iterator[bsfs.schema.Predicate]:
+ """Return the principal predicates that can be extracted."""
+ return iter({pred for ext in self._ext2rdr for pred in ext.principals})
+
+ def subschema(self, principals: typing.Iterable[bsfs.schema.Predicate]) -> bsfs.schema.Schema:
+ """Return the subset of the schema that supports the given *principals*."""
+ # materialize principals
+ principals = set(principals)
+ # collect and combine schemas from extractors
+ return bsfs.schema.Schema.Union({
+ ext.schema
+ for ext
+ in self._ext2rdr
+ if not set(ext.principals).isdisjoint(principals)
+ })
def __call__(
self,
- path: URI,
- predicates: typing.Optional[typing.Iterable[_schema.Predicate]] = None,
- ) -> typing.Iterator[typing.Tuple[Node, _schema.Predicate, typing.Any]]:
- """Extract triples from the file at *path*. Optionally, limit triples to *predicates*."""
- # get predicates
- predicates = set(predicates) if predicates is not None else set(self.schema.predicates())
+ path: bsfs.URI,
+ principals: typing.Optional[typing.Iterable[bsfs.schema.Predicate]] = None,
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+ """Extract triples from the file at *path*. Optionally, limit triples to *principals*."""
+ # get principals
+ principals = set(principals) if principals is not None else set(self.schema.predicates())
# get extractors
- extractors = {ext for ext in self._ext2rdr if not set(ext.predicates()).isdisjoint(predicates)}
+ extractors = {ext for ext in self._ext2rdr if not set(ext.principals).isdisjoint(principals)}
# corner-case short-cut
if len(extractors) == 0:
@@ -110,8 +126,8 @@ class Pipeline():
for ext in extrs:
try:
# get predicate/value tuples
- for node, pred, value in ext.extract(subject, content, predicates):
- yield node, pred, value
+ for subject, pred, value in ext.extract(subject, content, principals):
+ yield subject, pred, value
except base.errors.ExtractorError as err:
# critical extractor failure.
diff --git a/test/lib/test_bsie.py b/test/lib/test_bsie.py
index 43e7b1d..f3f476e 100644
--- a/test/lib/test_bsie.py
+++ b/test/lib/test_bsie.py
@@ -11,8 +11,6 @@ import unittest
# bsie imports
from bsie.base import extractor
from bsie.tools import builder
-from bsie.utils.bsfs import URI, schema
-from bsie.utils.node import Node
from bsie.utils import bsfs, node, ns
# objects to test
@@ -47,22 +45,12 @@ class TestBSIE(unittest.TestCase):
def test_construction(self):
# pipeline only
lib = BSIE(self.pipeline)
- self.assertSetEqual(lib.predicates, {
+ self.assertSetEqual(set(lib.principals), {
ns.bse.filename,
ns.bse.filesize,
ns.bse.author,
})
- self.assertEqual(lib.schema, schema.Schema.from_string('''
- prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
- prefix xsd: <http://www.w3.org/2001/XMLSchema#>
- prefix bsfs: <http://bsfs.ai/schema/>
- prefix bse: <http://bsfs.ai/schema/Entity#>
- # essential nodes
- bsfs:Entity rdfs:subClassOf bsfs:Node .
- # common definitions
- xsd:string rdfs:subClassOf bsfs:Literal .
- xsd:integer rdfs:subClassOf bsfs:Literal .
-
+ self.assertEqual(lib.schema, bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
bse:filename rdfs:subClassOf bsfs:Predicate ;
rdfs:domain bsfs:File ;
rdfs:range xsd:string ;
@@ -85,21 +73,11 @@ class TestBSIE(unittest.TestCase):
ns.bse.author,
ns.bse.inexistent,
})
- self.assertSetEqual(lib.predicates, {
+ self.assertSetEqual(set(lib.principals), {
ns.bse.filesize,
ns.bse.author,
})
- self.assertEqual(lib.schema, schema.Schema.from_string('''
- prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
- prefix xsd: <http://www.w3.org/2001/XMLSchema#>
- prefix bsfs: <http://bsfs.ai/schema/>
- prefix bse: <http://bsfs.ai/schema/Entity#>
- # essential nodes
- bsfs:Entity rdfs:subClassOf bsfs:Node .
- # common definitions
- xsd:string rdfs:subClassOf bsfs:Literal .
- xsd:integer rdfs:subClassOf bsfs:Literal .
-
+ self.assertEqual(lib.schema, bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
bse:filesize rdfs:subClassOf bsfs:Predicate ;
rdfs:domain bsfs:File ;
rdfs:range xsd:integer;
@@ -109,26 +87,15 @@ class TestBSIE(unittest.TestCase):
rdfs:domain bsfs:Entity ;
rdfs:range xsd:string ;
bsfs:unique "true"^^xsd:boolean .
-
'''))
# empty collect is disregarded
lib = BSIE(self.pipeline, collect={})
- self.assertSetEqual(lib.predicates, {
+ self.assertSetEqual(set(lib.principals), {
ns.bse.filename,
ns.bse.filesize,
ns.bse.author,
})
- self.assertEqual(lib.schema, schema.Schema.from_string('''
- prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
- prefix xsd: <http://www.w3.org/2001/XMLSchema#>
- prefix bsfs: <http://bsfs.ai/schema/>
- prefix bse: <http://bsfs.ai/schema/Entity#>
- # essential nodes
- bsfs:Entity rdfs:subClassOf bsfs:Node .
- # common definitions
- xsd:string rdfs:subClassOf bsfs:Literal .
- xsd:integer rdfs:subClassOf bsfs:Literal .
-
+ self.assertEqual(lib.schema, bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
bse:filename rdfs:subClassOf bsfs:Predicate ;
rdfs:domain bsfs:File ;
rdfs:range xsd:string ;
@@ -152,24 +119,14 @@ class TestBSIE(unittest.TestCase):
ns.bse.filename,
ns.bse.inexistent,
})
- self.assertSetEqual(lib.predicates, {
+ self.assertSetEqual(set(lib.principals), {
ns.bse.author,
})
- self.assertEqual(lib.schema, schema.Schema.from_string('''
- prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
- prefix xsd: <http://www.w3.org/2001/XMLSchema#>
- prefix bsfs: <http://bsfs.ai/schema/>
- prefix bse: <http://bsfs.ai/schema/Entity#>
- # essential nodes
- bsfs:Entity rdfs:subClassOf bsfs:Node .
- # common definitions
- xsd:string rdfs:subClassOf bsfs:Literal .
-
+ self.assertEqual(lib.schema, bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
bse:author rdfs:subClassOf bsfs:Predicate ;
rdfs:domain bsfs:Entity ;
rdfs:range xsd:string ;
bsfs:unique "true"^^xsd:boolean .
-
'''))
# specify collect and discard
@@ -177,19 +134,10 @@ class TestBSIE(unittest.TestCase):
collect={ns.bse.filesize, ns.bse.author, ns.bse.foo, ns.bse.bar},
discard={ns.bse.author, ns.bse.foo, ns.bse.foobar},
)
- self.assertSetEqual(lib.predicates, {
+ self.assertSetEqual(set(lib.principals), {
ns.bse.filesize,
})
- self.assertEqual(lib.schema, schema.Schema.from_string('''
- prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
- prefix xsd: <http://www.w3.org/2001/XMLSchema#>
- prefix bsfs: <http://bsfs.ai/schema/>
- prefix bse: <http://bsfs.ai/schema/Entity#>
- # essential nodes
- bsfs:Entity rdfs:subClassOf bsfs:Node .
- # common definitions
- xsd:integer rdfs:subClassOf bsfs:Literal .
-
+ self.assertEqual(lib.schema, bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
bse:filesize rdfs:subClassOf bsfs:Predicate ;
rdfs:domain bsfs:File ;
rdfs:range xsd:integer;
@@ -201,7 +149,7 @@ class TestBSIE(unittest.TestCase):
def test_from_file(self):
# setup
lib = BSIE(self.pipeline)
- self.assertSetEqual(set(lib.predicates), {
+ self.assertSetEqual(set(lib.principals), {
ns.bse.filesize,
ns.bse.filename,
ns.bse.author,
diff --git a/test/tools/test_pipeline.py b/test/tools/test_pipeline.py
index e440ab5..91bf736 100644
--- a/test/tools/test_pipeline.py
+++ b/test/tools/test_pipeline.py
@@ -75,7 +75,7 @@ class TestPipeline(unittest.TestCase):
# equivalence respects schema
p2 = Pipeline(self.prefix, self.ext2rdr)
- p2.schema = pipeline.schema.Empty()
+ p2._schema = pipeline.schema.Empty()
self.assertNotEqual(pipeline, p2)
self.assertNotEqual(hash(pipeline), hash(p2))
@@ -160,8 +160,7 @@ class TestPipeline(unittest.TestCase):
# build pipeline
pipeline = Pipeline(self.prefix, self.ext2rdr)
#
- self.assertSetEqual(set(pipeline.predicates()), {
- pipeline.schema.predicate(ns.bsfs.Predicate),
+ self.assertSetEqual(set(pipeline.principals), {
pipeline.schema.predicate(ns.bse.filename),
pipeline.schema.predicate(ns.bse.filesize),
pipeline.schema.predicate(ns.bse.author),