diff options
Diffstat (limited to 'bsie')
-rw-r--r-- | bsie/apps/index.py | 16 | ||||
-rw-r--r-- | bsie/apps/info.py | 1 | ||||
-rw-r--r-- | bsie/lib/__init__.py | 1 | ||||
-rw-r--r-- | bsie/lib/bsie.py | 10 | ||||
-rw-r--r-- | bsie/lib/builder.py | 9 | ||||
-rw-r--r-- | bsie/lib/naming_policy.py | 101 | ||||
-rw-r--r-- | bsie/lib/pipeline.py | 18 | ||||
-rw-r--r-- | bsie/utils/node.py | 29 |
8 files changed, 149 insertions, 36 deletions
diff --git a/bsie/apps/index.py b/bsie/apps/index.py index 21c2318..a870364 100644 --- a/bsie/apps/index.py +++ b/bsie/apps/index.py @@ -11,7 +11,7 @@ import typing # bsie imports from bsie.extractor import ExtractorBuilder -from bsie.lib import BSIE, PipelineBuilder +from bsie.lib import BSIE, PipelineBuilder, DefaultNamingPolicy from bsie.reader import ReaderBuilder from bsie.utils import bsfs, errors @@ -26,7 +26,9 @@ __all__: typing.Sequence[str] = ( def main(argv): """Index files or directories into BSFS.""" parser = argparse.ArgumentParser(description=main.__doc__, prog='index') - parser.add_argument('--user', type=bsfs.URI, default=bsfs.URI('http://example.com/me'), + parser.add_argument('--host', type=bsfs.URI, default=bsfs.URI('http://example.com'), + help='') + parser.add_argument('--user', type=str, default='me', help='') parser.add_argument('--collect', action='append', default=[], help='') @@ -66,16 +68,19 @@ def main(argv): ]) # pipeline builder pbuild = PipelineBuilder( - bsfs.Namespace(args.user + ('/' if not args.user.endswith('/') else '')), rbuild, ebuild, ) # build pipeline pipeline = pbuild.build() + # build the naming policy + naming_policy = DefaultNamingPolicy( + host=args.host, + user=args.user, + ) # build BSIE frontend - bsie = BSIE(pipeline, args.collect, args.discard) - + bsie = BSIE(pipeline, naming_policy, args.collect, args.discard) def walk(handle): """Walk through given input files.""" @@ -83,7 +88,6 @@ def main(argv): # FIXME: simplify code (below but maybe also above) # FIXME: How to handle dependencies between data? # E.g. do I still want to link to a tag despite not being permitted to set its label? - # FIXME: node renaming? # index input paths for path in args.input_file: diff --git a/bsie/apps/info.py b/bsie/apps/info.py index 64a4eba..4e948fc 100644 --- a/bsie/apps/info.py +++ b/bsie/apps/info.py @@ -54,7 +54,6 @@ def main(argv): ]) # pipeline builder pbuild = PipelineBuilder( - bsfs.Namespace('http://example.com/me/'), # not actually used rbuild, ebuild, ) diff --git a/bsie/lib/__init__.py b/bsie/lib/__init__.py index 4239d3b..48379de 100644 --- a/bsie/lib/__init__.py +++ b/bsie/lib/__init__.py @@ -10,6 +10,7 @@ import typing # inner-module imports from .bsie import BSIE from .builder import PipelineBuilder +from .naming_policy import DefaultNamingPolicy # exports __all__: typing.Sequence[str] = ( diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py index 668783d..a572525 100644 --- a/bsie/lib/bsie.py +++ b/bsie/lib/bsie.py @@ -11,6 +11,7 @@ import typing from bsie.utils import bsfs, node, ns # inner-module imports +from .naming_policy import NamingPolicy from .pipeline import Pipeline # exports @@ -41,15 +42,18 @@ class BSIE(): def __init__( self, - # pipeline builder. + # pipeline. pipeline: Pipeline, + # naming policy + naming_policy: NamingPolicy, # principals to extract at most. None implies all available w.r.t. extractors. collect: typing.Optional[typing.Iterable[bsfs.URI]] = None, # principals to discard. discard: typing.Optional[typing.Iterable[bsfs.URI]] = None, ): - # store pipeline + # store pipeline and naming policy self._pipeline = pipeline + self._naming_policy = naming_policy # start off with available principals self._principals = {pred.uri for pred in self._pipeline.principals} # limit principals to specified ones by argument. @@ -89,6 +93,6 @@ class BSIE(): # predicate lookup principals = {self.schema.predicate(pred) for pred in principals} # invoke pipeline - yield from self._pipeline(path, principals) + yield from self._naming_policy(self._pipeline(path, principals)) ## EOF ## diff --git a/bsie/lib/builder.py b/bsie/lib/builder.py index c2abffe..39da441 100644 --- a/bsie/lib/builder.py +++ b/bsie/lib/builder.py @@ -11,7 +11,7 @@ import typing # bsie imports from bsie.extractor import ExtractorBuilder from bsie.reader import ReaderBuilder -from bsie.utils import bsfs, errors +from bsie.utils import errors # inner-module imports from . import pipeline @@ -29,9 +29,6 @@ logger = logging.getLogger(__name__) class PipelineBuilder(): """Build `bsie.tools.pipeline.Pipeline` instances.""" - # Prefix to be used in the Pipeline. - prefix: bsfs.Namespace - # builder for Readers. rbuild: ReaderBuilder @@ -40,11 +37,9 @@ class PipelineBuilder(): def __init__( self, - prefix: bsfs.Namespace, reader_builder: ReaderBuilder, extractor_builder: ExtractorBuilder, ): - self.prefix = prefix self.rbuild = reader_builder self.ebuild = extractor_builder @@ -80,6 +75,6 @@ class PipelineBuilder(): except errors.BuilderError as err: # failed to build reader logger.error(str(err)) - return pipeline.Pipeline(self.prefix, ext2rdr) + return pipeline.Pipeline(ext2rdr) ## EOF ## diff --git a/bsie/lib/naming_policy.py b/bsie/lib/naming_policy.py new file mode 100644 index 0000000..360abde --- /dev/null +++ b/bsie/lib/naming_policy.py @@ -0,0 +1,101 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# standard imports +import abc +import os +import typing + +# bsie imports +from bsie.utils import bsfs, errors, ns +from bsie.utils.node import Node + +# exports +__all__: typing.Sequence[str] = ( + 'DefaultNamingPolicy', + ) + + +## code ## + +class NamingPolicy(): + """Determine node uri's from node hints.""" + def __call__( + self, + iterable: typing.Iterable[typing.Tuple[Node, bsfs.URI, typing.Any]], + ): + """Apply the policy on a triple iterator.""" + return NamingPolicyIterator(self, iterable) + + @abc.abstractmethod + def handle_node(self, node: Node) -> Node: + """Apply the policy on a node.""" + + +class NamingPolicyIterator(): + """Iterates over triples, determines uris according to a *policy* as it goes.""" + + # source triple iterator. + _iterable: typing.Iterable[typing.Tuple[Node, bsfs.URI, typing.Any]] + + # naming policy + _policy: NamingPolicy + + def __init__( + self, + policy: NamingPolicy, + iterable: typing.Iterable[typing.Tuple[Node, bsfs.URI, typing.Any]], + ): + self._iterable = iterable + self._policy = policy + + def __iter__(self): + for node, pred, value in self._iterable: + # handle subject + self._policy.handle_node(node) + # handle value + if isinstance(value, Node): + self._policy.handle_node(value) + # yield triple + yield node, pred, value + + +class DefaultNamingPolicy(NamingPolicy): + """Compose URIs as <host/user/node_type#fragment> + + What information is used as fragment depends on the node type. + Typically, the default is to use the "ucid" hint. + The fallback in all cases is to generate a random uuid. + + Never changes previously assigned uris. Sets uris in-place. + + """ + + def __init__( + self, + host: bsfs.URI, + user: str, + ): + self._prefix = bsfs.Namespace(os.path.join(host, user)) + self._uuid = bsfs.uuid.UUID() + + def handle_node(self, node: Node) -> Node: + if node.uri is not None: + return node + if node.node_type == ns.bsfs.File: + return self.name_file(node) + raise errors.ProgrammingError('no naming policy available for {node.node_type}') + + def name_file(self, node: Node) -> Node: + """Set a bsfs:File node's uri fragment to its ucid.""" + if 'ucid' in node.hints: # content id + fragment = node.hints['ucid'] + else: # random name + fragment = self._uuid() + node.uri = (self._prefix + 'file')[fragment] + return node + +## EOF ## diff --git a/bsie/lib/pipeline.py b/bsie/lib/pipeline.py index 44685ba..0bc5109 100644 --- a/bsie/lib/pipeline.py +++ b/bsie/lib/pipeline.py @@ -19,8 +19,6 @@ __all__: typing.Sequence[str] = ( 'Pipeline', ) -# constants -FILE_PREFIX = 'file#' ## code ## @@ -40,19 +38,14 @@ class Pipeline(): # combined extractor schemas. _schema: bsfs.schema.Schema - # node prefix. - _prefix: bsfs.Namespace - # extractor -> reader mapping _ext2rdr: typing.Dict[Extractor, typing.Optional[Reader]] def __init__( self, - prefix: bsfs.Namespace, ext2rdr: typing.Dict[Extractor, typing.Optional[Reader]] ): # store core members - self._prefix = prefix + FILE_PREFIX self._ext2rdr = ext2rdr # compile schema from all extractors self._schema = bsfs.schema.Schema.Union(ext.schema for ext in ext2rdr) @@ -64,12 +57,11 @@ class Pipeline(): return f'{bsfs.typename(self)}(...)' def __hash__(self) -> int: - return hash((type(self), self._prefix, self._schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values()))) + return hash((type(self), self._schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values()))) def __eq__(self, other: typing.Any) -> bool: return isinstance(other, type(self)) \ and self._schema == other._schema \ - and self._prefix == other._prefix \ and self._ext2rdr == other._ext2rdr @property @@ -117,8 +109,9 @@ class Pipeline(): rdr2ext[rdr].add(ext) # create subject for file - uuid = bsfs.uuid.UCID.from_path(path) - subject = node.Node(ns.bsfs.File, self._prefix[uuid]) + subject = node.Node(ns.bsfs.File, + ucid=bsfs.uuid.UCID.from_path(path), + ) # extract information for rdr, extrs in rdr2ext.items(): @@ -131,8 +124,7 @@ class Pipeline(): for ext in extrs: try: # get predicate/value tuples - for subject, pred, value in ext.extract(subject, content, principals): - yield subject, pred, value + yield from ext.extract(subject, content, principals) except errors.ExtractorError as err: # critical extractor failure. diff --git a/bsie/utils/node.py b/bsie/utils/node.py index 91e4f37..aa62c06 100644 --- a/bsie/utils/node.py +++ b/bsie/utils/node.py @@ -19,30 +19,47 @@ __all__: typing.Sequence[str] = ( ## code ## class Node(): - """Lightweight Node, disconnected from any bsfs structures.""" + """Lightweight Node, disconnected from any bsfs structures. + + In most cases, provide *hints* and leave setting the uri to a node + naming policy. Only provide an *uri* if it is absolutely determined. + + """ # node type. node_type: bsfs.URI # node URI. - uri: bsfs.URI + uri: typing.Optional[bsfs.URI] + + # node naming hints. + hits: dict def __init__( self, node_type: bsfs.URI, - uri: bsfs.URI, + uri: typing.Optional[bsfs.URI] = None, + **uri_hints, ): # assign members self.node_type = bsfs.URI(node_type) - self.uri = bsfs.URI(uri) + self.hints = uri_hints + self.uri = uri def __eq__(self, other: typing.Any) -> bool: + """Compare two Node instances based on type and uri. + Compares hits only if the uri is not yet specified. + """ return isinstance(other, Node) \ and other.node_type == self.node_type \ - and other.uri == self.uri + and other.uri == self.uri \ + and (self.uri is not None or self.hints == other.hints) def __hash__(self) -> int: - return hash((type(self), self.node_type, self.uri)) + identifier = self.uri + if identifier is None: + identifier = tuple((key, self.hints[key]) for key in sorted(self.hints)) + return hash((type(self), self.node_type, identifier)) def __str__(self) -> str: return f'{bsfs.typename(self)}({self.node_type}, {self.uri})' |