aboutsummaryrefslogtreecommitdiffstats
path: root/bsie/lib
diff options
context:
space:
mode:
Diffstat (limited to 'bsie/lib')
-rw-r--r--bsie/lib/__init__.py4
-rw-r--r--bsie/lib/bsie.py6
-rw-r--r--bsie/lib/builder.py85
-rw-r--r--bsie/lib/pipeline.py145
4 files changed, 237 insertions, 3 deletions
diff --git a/bsie/lib/__init__.py b/bsie/lib/__init__.py
index 578c2c4..4239d3b 100644
--- a/bsie/lib/__init__.py
+++ b/bsie/lib/__init__.py
@@ -4,15 +4,17 @@ Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import typing
# inner-module imports
from .bsie import BSIE
+from .builder import PipelineBuilder
# exports
__all__: typing.Sequence[str] = (
'BSIE',
+ 'PipelineBuilder',
)
## EOF ##
diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py
index e087fa9..668783d 100644
--- a/bsie/lib/bsie.py
+++ b/bsie/lib/bsie.py
@@ -4,13 +4,15 @@ Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import typing
# bsie imports
-from bsie.tools import Pipeline
from bsie.utils import bsfs, node, ns
+# inner-module imports
+from .pipeline import Pipeline
+
# exports
__all__: typing.Sequence[str] = (
'BSIE',
diff --git a/bsie/lib/builder.py b/bsie/lib/builder.py
new file mode 100644
index 0000000..c2abffe
--- /dev/null
+++ b/bsie/lib/builder.py
@@ -0,0 +1,85 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# standard imports
+import logging
+import typing
+
+# bsie imports
+from bsie.extractor import ExtractorBuilder
+from bsie.reader import ReaderBuilder
+from bsie.utils import bsfs, errors
+
+# inner-module imports
+from . import pipeline
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'PipelineBuilder',
+ )
+
+
+## code ##
+
+logger = logging.getLogger(__name__)
+
+class PipelineBuilder():
+ """Build `bsie.tools.pipeline.Pipeline` instances."""
+
+ # Prefix to be used in the Pipeline.
+ prefix: bsfs.Namespace
+
+ # builder for Readers.
+ rbuild: ReaderBuilder
+
+ # builder for Extractors.
+ ebuild: ExtractorBuilder
+
+ def __init__(
+ self,
+ prefix: bsfs.Namespace,
+ reader_builder: ReaderBuilder,
+ extractor_builder: ExtractorBuilder,
+ ):
+ self.prefix = prefix
+ self.rbuild = reader_builder
+ self.ebuild = extractor_builder
+
+ def build(self) -> pipeline.Pipeline:
+ """Return a Pipeline instance."""
+ ext2rdr = {}
+
+ for eidx in self.ebuild:
+ # build extractor
+ try:
+ ext = self.ebuild.build(eidx)
+
+ except errors.LoaderError as err: # failed to load extractor; skip
+ logger.error('failed to load extractor: %s', err)
+ continue
+
+ except errors.BuilderError as err: # failed to build instance; skip
+ logger.error(str(err))
+ continue
+
+ try:
+ # get reader required by extractor
+ if ext.CONTENT_READER is not None:
+ rdr = self.rbuild.build(ext.CONTENT_READER)
+ else:
+ rdr = None
+ # store extractor
+ ext2rdr[ext] = rdr
+
+ except errors.LoaderError as err: # failed to load reader
+ logger.error('failed to load reader: %s', err)
+
+ except errors.BuilderError as err: # failed to build reader
+ logger.error(str(err))
+
+ return pipeline.Pipeline(self.prefix, ext2rdr)
+
+## EOF ##
diff --git a/bsie/lib/pipeline.py b/bsie/lib/pipeline.py
new file mode 100644
index 0000000..e5ce1b7
--- /dev/null
+++ b/bsie/lib/pipeline.py
@@ -0,0 +1,145 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# standard imports
+from collections import defaultdict
+import logging
+import typing
+
+# bsie imports
+from bsie.extractor import Extractor
+from bsie.reader import Reader
+from bsie.utils import bsfs, errors, node, ns
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Pipeline',
+ )
+
+# constants
+FILE_PREFIX = 'file#'
+
+## code ##
+
+logger = logging.getLogger(__name__)
+
+class Pipeline():
+ """Extraction pipeline to generate triples from files.
+
+ The Pipeline binds readers and extractors, and performs
+ the necessary operations to produce triples from a file.
+ It takes a best-effort approach to extract as many triples
+ as possible. Errors during the extraction are passed over
+ and reported to the log.
+
+ """
+
+ # combined extractor schemas.
+ _schema: bsfs.schema.Schema
+
+ # node prefix.
+ _prefix: bsfs.Namespace
+
+ # extractor -> reader mapping
+ _ext2rdr: typing.Dict[Extractor, typing.Optional[Reader]]
+
+ def __init__(
+ self,
+ prefix: bsfs.Namespace,
+ ext2rdr: typing.Dict[Extractor, typing.Optional[Reader]]
+ ):
+ # store core members
+ self._prefix = prefix + FILE_PREFIX
+ self._ext2rdr = ext2rdr
+ # compile schema from all extractors
+ self._schema = bsfs.schema.Schema.Union(ext.schema for ext in ext2rdr)
+
+ def __str__(self) -> str:
+ return bsfs.typename(self)
+
+ def __repr__(self) -> str:
+ return f'{bsfs.typename(self)}(...)'
+
+ def __hash__(self) -> int:
+ return hash((type(self), self._prefix, self._schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values())))
+
+ def __eq__(self, other: typing.Any) -> bool:
+ return isinstance(other, type(self)) \
+ and self._schema == other._schema \
+ and self._prefix == other._prefix \
+ and self._ext2rdr == other._ext2rdr
+
+ @property
+ def schema(self) -> bsfs.schema.Schema:
+ """Return the pipeline's schema (combined from all extractors)."""
+ return self._schema
+
+ @property
+ def principals(self) -> typing.Iterator[bsfs.schema.Predicate]:
+ """Return the principal predicates that can be extracted."""
+ return iter({pred for ext in self._ext2rdr for pred in ext.principals})
+
+ def subschema(self, principals: typing.Iterable[bsfs.schema.Predicate]) -> bsfs.schema.Schema:
+ """Return the subset of the schema that supports the given *principals*."""
+ # materialize principals
+ principals = set(principals)
+ # collect and combine schemas from extractors
+ return bsfs.schema.Schema.Union({
+ ext.schema
+ for ext
+ in self._ext2rdr
+ if not set(ext.principals).isdisjoint(principals)
+ })
+
+ def __call__(
+ self,
+ path: bsfs.URI,
+ principals: typing.Optional[typing.Iterable[bsfs.schema.Predicate]] = None,
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+ """Extract triples from the file at *path*. Optionally, limit triples to *principals*."""
+ # get principals
+ principals = set(principals) if principals is not None else set(self.schema.predicates())
+
+ # get extractors
+ extractors = {ext for ext in self._ext2rdr if not set(ext.principals).isdisjoint(principals)}
+
+ # corner-case short-cut
+ if len(extractors) == 0:
+ return
+
+ # get readers -> extractors mapping
+ rdr2ext = defaultdict(set)
+ for ext in extractors:
+ rdr = self._ext2rdr[ext]
+ rdr2ext[rdr].add(ext)
+
+ # create subject for file
+ uuid = bsfs.uuid.UCID.from_path(path)
+ subject = node.Node(ns.bsfs.File, self._prefix[uuid])
+
+ # extract information
+ for rdr, extrs in rdr2ext.items():
+ try:
+ # get content
+ content = rdr(path) if rdr is not None else None
+
+ # apply extractors on this content
+ for ext in extrs:
+ try:
+ # get predicate/value tuples
+ for subject, pred, value in ext.extract(subject, content, principals):
+ yield subject, pred, value
+
+ except errors.ExtractorError as err:
+ # critical extractor failure.
+ logger.error('%s failed to extract triples from content: %s', ext, err)
+
+ except errors.ReaderError as err:
+ # failed to read any content. skip.
+ logger.error('%s failed to read content: %s', rdr, err)
+
+
+## EOF ##