Diffstat (limited to 'bsie/lib')
-rw-r--r--    bsie/lib/__init__.py        1
-rw-r--r--    bsie/lib/bsie.py           10
-rw-r--r--    bsie/lib/builder.py         9
-rw-r--r--    bsie/lib/naming_policy.py 120
-rw-r--r--    bsie/lib/pipeline.py       18
5 files changed, 135 insertions, 23 deletions
diff --git a/bsie/lib/__init__.py b/bsie/lib/__init__.py
index 4239d3b..48379de 100644
--- a/bsie/lib/__init__.py
+++ b/bsie/lib/__init__.py
@@ -10,6 +10,7 @@ import typing
# inner-module imports
from .bsie import BSIE
from .builder import PipelineBuilder
+from .naming_policy import DefaultNamingPolicy

# exports
__all__: typing.Sequence[str] = (
diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py
index 668783d..a572525 100644
--- a/bsie/lib/bsie.py
+++ b/bsie/lib/bsie.py
@@ -11,6 +11,7 @@ import typing
from bsie.utils import bsfs, node, ns

# inner-module imports
+from .naming_policy import NamingPolicy
from .pipeline import Pipeline

# exports
@@ -41,15 +42,18 @@ class BSIE():

    def __init__(
            self,
-            # pipeline builder.
+            # pipeline.
            pipeline: Pipeline,
+            # naming policy
+            naming_policy: NamingPolicy,
            # principals to extract at most. None implies all available w.r.t. extractors.
            collect: typing.Optional[typing.Iterable[bsfs.URI]] = None,
            # principals to discard.
            discard: typing.Optional[typing.Iterable[bsfs.URI]] = None,
            ):
-        # store pipeline
+        # store pipeline and naming policy
        self._pipeline = pipeline
+        self._naming_policy = naming_policy
        # start off with available principals
        self._principals = {pred.uri for pred in self._pipeline.principals}
        # limit principals to specified ones by argument.
@@ -89,6 +93,6 @@ class BSIE():
        # predicate lookup
        principals = {self.schema.predicate(pred) for pred in principals}
        # invoke pipeline
-        yield from self._pipeline(path, principals)
+        yield from self._naming_policy(self._pipeline(path, principals))

## EOF ##
diff --git a/bsie/lib/builder.py b/bsie/lib/builder.py
index c2abffe..39da441 100644
--- a/bsie/lib/builder.py
+++ b/bsie/lib/builder.py
@@ -11,7 +11,7 @@ import typing
# bsie imports
from bsie.extractor import ExtractorBuilder
from bsie.reader import ReaderBuilder
-from bsie.utils import bsfs, errors
+from bsie.utils import errors

# inner-module imports
from . import pipeline
@@ -29,9 +29,6 @@ logger = logging.getLogger(__name__)
class PipelineBuilder():
    """Build `bsie.tools.pipeline.Pipeline` instances."""

-    # Prefix to be used in the Pipeline.
-    prefix: bsfs.Namespace
-
    # builder for Readers.
    rbuild: ReaderBuilder

@@ -40,11 +37,9 @@ class PipelineBuilder():

    def __init__(
            self,
-            prefix: bsfs.Namespace,
            reader_builder: ReaderBuilder,
            extractor_builder: ExtractorBuilder,
            ):
-        self.prefix = prefix
        self.rbuild = reader_builder
        self.ebuild = extractor_builder

@@ -80,6 +75,6 @@ class PipelineBuilder():
            except errors.BuilderError as err: # failed to build reader
                logger.error(str(err))

-        return pipeline.Pipeline(self.prefix, ext2rdr)
+        return pipeline.Pipeline(ext2rdr)

## EOF ##
diff --git a/bsie/lib/naming_policy.py b/bsie/lib/naming_policy.py
new file mode 100644
index 0000000..131a70b
--- /dev/null
+++ b/bsie/lib/naming_policy.py
@@ -0,0 +1,120 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# standard imports
+import abc
+import os
+import typing
+
+# bsie imports
+from bsie.utils import bsfs, errors, ns
+from bsie.utils.node import Node
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'DefaultNamingPolicy',
+ )
+
+
+## code ##
+
+class NamingPolicy():
+ """Determine node uri's from node hints."""
+ def __call__(
+ self,
+ iterable: typing.Iterable[typing.Tuple[Node, bsfs.URI, typing.Any]],
+ ):
+ """Apply the policy on a triple iterator."""
+ return NamingPolicyIterator(self, iterable)
+
+ @abc.abstractmethod
+ def handle_node(self, node: Node) -> Node:
+ """Apply the policy on a node."""
+
+
+class NamingPolicyIterator():
+ """Iterates over triples, determines uris according to a *policy* as it goes."""
+
+ # source triple iterator.
+ _iterable: typing.Iterable[typing.Tuple[Node, bsfs.URI, typing.Any]]
+
+ # naming policy
+ _policy: NamingPolicy
+
+ def __init__(
+ self,
+ policy: NamingPolicy,
+ iterable: typing.Iterable[typing.Tuple[Node, bsfs.URI, typing.Any]],
+ ):
+ self._iterable = iterable
+ self._policy = policy
+
+ def __iter__(self):
+ for node, pred, value in self._iterable:
+ # handle subject
+ self._policy.handle_node(node)
+ # handle value
+ if isinstance(value, Node):
+ self._policy.handle_node(value)
+ # yield triple
+ yield node, pred, value
+
+
+class DefaultNamingPolicy(NamingPolicy):
+ """Compose URIs as <host/user/node_type#fragment>
+
+ What information is used as fragment depends on the node type.
+ Typically, the default is to use the "ucid" hint.
+ The fallback in all cases is to generate a random uuid.
+
+ Never changes previously assigned uris. Sets uris in-place.
+
+ """
+
+ def __init__(
+ self,
+ host: bsfs.URI,
+ user: str,
+ ):
+ self._prefix = bsfs.Namespace(os.path.join(host, user))
+ self._uuid = bsfs.uuid.UUID()
+
+ def handle_node(self, node: Node) -> Node:
+ if node.uri is not None:
+ return node
+ if node.node_type == ns.bsfs.File:
+ return self.name_file(node)
+ if node.node_type == ns.bsfs.Preview:
+ return self.name_preview(node)
+ raise errors.ProgrammingError('no naming policy available for {node.node_type}')
+
+ def name_file(self, node: Node) -> Node:
+ """Set a bsfs:File node's uri fragment to its ucid."""
+ if 'ucid' in node.hints: # content id
+ fragment = node.hints['ucid']
+ else: # random name
+ fragment = self._uuid()
+ node.uri = (self._prefix + 'file')[fragment]
+ return node
+
+ def name_preview(self, node: Node) -> Node:
+ """Set a bsfs:Preview node's uri fragment to its ucid.
+ Uses its source fragment as fallback. Appends the size if provided.
+ """
+ fragment = None
+ if 'ucid' in node.hints: # content id
+ fragment = node.hints['ucid']
+ if fragment is None and 'source' in node.hints: # source id
+ self.handle_node(node.hints['source'])
+ fragment = node.hints['source'].uri.get('fragment', None)
+ if fragment is None: # random name
+ fragment = self._uuid()
+ if 'size' in node.hints: # append size
+ fragment += '_s' + str(node.hints['size'])
+ node.uri = (self._prefix + 'preview')[fragment]
+ return node
+
+## EOF ##
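
To illustrate how the new naming policy is meant to be used, here is a minimal sketch (not part of the commit). It assumes that bsfs.URI can be constructed from a plain string and that Node stores extra keyword arguments as hints, as the pipeline change below suggests; the predicate URI and the host/user values are made up.

    # minimal usage sketch; assumptions noted above
    from bsie.lib.naming_policy import DefaultNamingPolicy
    from bsie.utils import bsfs, ns
    from bsie.utils.node import Node

    policy = DefaultNamingPolicy(host=bsfs.URI('http://example.com'), user='me')

    # a triple stream as the pipeline would produce it: the subject carries a
    # 'ucid' hint but no URI yet
    subject = Node(ns.bsfs.File, ucid='1234abcd')
    triples = [(subject, bsfs.URI('http://example.com/filesize'), 4096)]

    # iterating through the policy assigns URIs in-place,
    # e.g. <http://example.com/me/file#1234abcd> for the subject above
    for subj, pred, value in policy(triples):
        print(subj.uri, pred, value)
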
diff --git a/bsie/lib/pipeline.py b/bsie/lib/pipeline.py
index 44685ba..0bc5109 100644
--- a/bsie/lib/pipeline.py
+++ b/bsie/lib/pipeline.py
@@ -19,8 +19,6 @@ __all__: typing.Sequence[str] = (
    'Pipeline',
    )

-# constants
-FILE_PREFIX = 'file#'


## code ##
@@ -40,19 +38,14 @@ class Pipeline():
    # combined extractor schemas.
    _schema: bsfs.schema.Schema

-    # node prefix.
-    _prefix: bsfs.Namespace
-
    # extractor -> reader mapping
    _ext2rdr: typing.Dict[Extractor, typing.Optional[Reader]]

    def __init__(
            self,
-            prefix: bsfs.Namespace,
            ext2rdr: typing.Dict[Extractor, typing.Optional[Reader]]
            ):
        # store core members
-        self._prefix = prefix + FILE_PREFIX
        self._ext2rdr = ext2rdr
        # compile schema from all extractors
        self._schema = bsfs.schema.Schema.Union(ext.schema for ext in ext2rdr)
@@ -64,12 +57,11 @@ class Pipeline():
        return f'{bsfs.typename(self)}(...)'

    def __hash__(self) -> int:
-        return hash((type(self), self._prefix, self._schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values())))
+        return hash((type(self), self._schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values())))

    def __eq__(self, other: typing.Any) -> bool:
        return isinstance(other, type(self)) \
            and self._schema == other._schema \
-            and self._prefix == other._prefix \
            and self._ext2rdr == other._ext2rdr

    @property
@@ -117,8 +109,9 @@ class Pipeline():
            rdr2ext[rdr].add(ext)

        # create subject for file
-        uuid = bsfs.uuid.UCID.from_path(path)
-        subject = node.Node(ns.bsfs.File, self._prefix[uuid])
+        subject = node.Node(ns.bsfs.File,
+            ucid=bsfs.uuid.UCID.from_path(path),
+            )

        # extract information
        for rdr, extrs in rdr2ext.items():
@@ -131,8 +124,7 @@ class Pipeline():
            for ext in extrs:
                try:
                    # get predicate/value tuples
-                    for subject, pred, value in ext.extract(subject, content, principals):
-                        yield subject, pred, value
+                    yield from ext.extract(subject, content, principals)
                except errors.ExtractorError as err:
                    # critical extractor failure.
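
Taken together, the wiring after this change looks roughly as follows. This is an illustrative sketch, not code from the commit: the builder's build method name is an assumption, the ReaderBuilder/ExtractorBuilder constructor arguments are elided, and the host/user values are made up.

    from bsie.extractor import ExtractorBuilder
    from bsie.reader import ReaderBuilder
    from bsie.lib import BSIE, PipelineBuilder, DefaultNamingPolicy
    from bsie.utils import bsfs

    # the builder no longer takes a prefix; constructor arguments elided (...)
    builder = PipelineBuilder(ReaderBuilder(...), ExtractorBuilder(...))
    pipeline = builder.build()  # the build method's name is an assumption

    # node naming is now a separate concern, handled by a policy passed to BSIE
    policy = DefaultNamingPolicy(host=bsfs.URI('http://example.com'), user='me')
    bsie = BSIE(pipeline, policy)
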