Diffstat (limited to 'bsie')
-rw-r--r--  bsie/__init__.py | 6
-rw-r--r--  bsie/apps/__init__.py | 46
-rw-r--r--  bsie/apps/_loader.py | 47
-rw-r--r--  bsie/apps/default_config.yaml | 19
-rw-r--r--  bsie/apps/index.py | 65
-rw-r--r--  bsie/apps/info.py | 49
-rw-r--r--  bsie/base/__init__.py | 24
-rw-r--r--  bsie/extractor/__init__.py | 14
-rw-r--r--  bsie/extractor/base.py (renamed from bsie/base/extractor.py) | 35
-rw-r--r--  bsie/extractor/builder.py | 72
-rw-r--r--  bsie/extractor/generic/__init__.py | 5
-rw-r--r--  bsie/extractor/generic/constant.py | 14
-rw-r--r--  bsie/extractor/generic/path.py | 17
-rw-r--r--  bsie/extractor/generic/stat.py | 18
-rw-r--r--  bsie/extractor/image/__init__.py | 8
-rw-r--r--  bsie/extractor/image/colors_spatial.py | 150
-rw-r--r--  bsie/extractor/image/photometrics.py | 211
-rw-r--r--  bsie/extractor/preview.py | 96
-rw-r--r--  bsie/lib/__init__.py | 10
-rw-r--r--  bsie/lib/bsie.py | 21
-rw-r--r--  bsie/lib/builder.py | 75
-rw-r--r--  bsie/lib/naming_policy.py | 115
-rw-r--r--  bsie/lib/pipeline.py (renamed from bsie/tools/pipeline.py) | 44
-rw-r--r--  bsie/reader/__init__.py | 19
-rw-r--r--  bsie/reader/base.py (renamed from bsie/base/reader.py) | 13
-rw-r--r--  bsie/reader/builder.py | 73
-rw-r--r--  bsie/reader/chain.py | 86
-rw-r--r--  bsie/reader/exif.py | 44
-rw-r--r--  bsie/reader/image/__init__.py | 31
-rw-r--r--  bsie/reader/image/_pillow.py | 34
-rw-r--r--  bsie/reader/image/_raw.py | 56
-rw-r--r--  bsie/reader/path.py | 12
-rw-r--r--  bsie/reader/preview/__init__.py | 34
-rw-r--r--  bsie/reader/preview/_pg.py | 81
-rw-r--r--  bsie/reader/preview/_pillow.py | 39
-rw-r--r--  bsie/reader/preview/_rawpy.py | 61
-rw-r--r--  bsie/reader/preview/utils.py | 34
-rw-r--r--  bsie/reader/stat.py | 13
-rw-r--r--  bsie/tools/__init__.py | 20
-rw-r--r--  bsie/tools/builder.py | 226
-rw-r--r--  bsie/utils/__init__.py | 11
-rw-r--r--  bsie/utils/bsfs.py | 6
-rw-r--r--  bsie/utils/errors.py (renamed from bsie/base/errors.py) | 12
-rw-r--r--  bsie/utils/filematcher/__init__.py | 15
-rw-r--r--  bsie/utils/filematcher/matcher.py | 174
-rw-r--r--  bsie/utils/filematcher/parser.py | 141
-rw-r--r--  bsie/utils/loading.py | 49
-rw-r--r--  bsie/utils/namespaces.py | 33
-rw-r--r--  bsie/utils/node.py | 35
49 files changed, 1998 insertions(+), 515 deletions(-)
diff --git a/bsie/__init__.py b/bsie/__init__.py
index 8d2308c..f6f2ff2 100644
--- a/bsie/__init__.py
+++ b/bsie/__init__.py
@@ -1,10 +1,6 @@
"""The BSIE module extracts triples from files for insertion into a BSFS storage.
-
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import collections
import typing
diff --git a/bsie/apps/__init__.py b/bsie/apps/__init__.py
index a548c3c..2fe4795 100644
--- a/bsie/apps/__init__.py
+++ b/bsie/apps/__init__.py
@@ -1,12 +1,13 @@
+#!/usr/bin/env python3
+"""BSIE tools.
"""
-
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
-"""
-# imports
+# standard imports
+import argparse
import typing
+# bsie imports
+import bsie
+
# inner-module imports
from .index import main as index
from .info import main as info
@@ -15,6 +16,39 @@ from .info import main as info
__all__: typing.Sequence[str] = (
'index',
'info',
+ 'main',
)
+# config
+apps = {
+ 'index' : index,
+ 'info' : info,
+ }
+
+
+## code ##
+
+def main(argv=None):
+ """Black Star File System maintenance tools."""
+ parser = argparse.ArgumentParser(description=main.__doc__, prog='bsie')
+ # version
+ parser.add_argument('--version', action='version',
+ version='%(prog)s version {}.{}.{}'.format(*bsie.version_info)) # pylint: disable=C0209
+ # application selection
+ parser.add_argument('app', choices=apps.keys(),
+ help='Select the application to run.')
+ # dangling args
+ parser.add_argument('rest', nargs=argparse.REMAINDER)
+ # parse
+ args = parser.parse_args(argv)
+ # run application
+ apps[args.app](args.rest)
+
+
+## main ##
+
+if __name__ == '__main__':
+ import sys
+ main(sys.argv[1:])
+
## EOF ##
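The dispatcher above simply looks up the selected app in the `apps` mapping and forwards the remaining arguments to it. A minimal usage sketch, invoking the `info` sub-command shown further below:

    # sketch: dispatches to bsie.apps.info.main(['predicates'])
    from bsie.apps import main
    main(['info', 'predicates'])
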
diff --git a/bsie/apps/_loader.py b/bsie/apps/_loader.py
new file mode 100644
index 0000000..6411f10
--- /dev/null
+++ b/bsie/apps/_loader.py
@@ -0,0 +1,47 @@
+
+# standard imports
+import typing
+
+# external imports
+import yaml
+
+# bsie imports
+from bsie.extractor import ExtractorBuilder
+from bsie.lib import PipelineBuilder
+from bsie.lib.pipeline import Pipeline
+from bsie.reader import ReaderBuilder
+
+# constants
+DEFAULT_CONFIG_FILE = 'default_config.yaml'
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'DEFAULT_CONFIG_FILE',
+ 'load_pipeline',
+ )
+
+
+## code ##
+
+def load_pipeline(path: str) -> Pipeline:
+ """Load a pipeline according to a config at *path*."""
+ # load config file
+ with open(path, 'rt', encoding='utf-8') as ifile:
+ cfg = yaml.safe_load(ifile)
+
+ # reader builder
+ rbuild = ReaderBuilder(cfg['ReaderBuilder'])
+ # extractor builder
+ ebuild = ExtractorBuilder(cfg['ExtractorBuilder'])
+ # pipeline builder
+ pbuild = PipelineBuilder(
+ rbuild,
+ ebuild,
+ )
+ # build pipeline
+ pipeline = pbuild.build()
+
+ # return pipeline
+ return pipeline
+
+## EOF ##
diff --git a/bsie/apps/default_config.yaml b/bsie/apps/default_config.yaml
new file mode 100644
index 0000000..a59b0f3
--- /dev/null
+++ b/bsie/apps/default_config.yaml
@@ -0,0 +1,19 @@
+
+ReaderBuilder: {}
+
+ExtractorBuilder:
+
+ - bsie.extractor.preview.Preview:
+ max_sides: [50, 100, 200,400]
+
+ - bsie.extractor.generic.path.Path: {}
+
+ - bsie.extractor.generic.stat.Stat: {}
+
+ - bsie.extractor.image.colors_spatial.ColorsSpatial:
+ width: 32
+ height: 32
+ exp: 4
+
+ - bsie.extractor.image.photometrics.Exif: {}
+
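The config above is consumed by _loader.load_pipeline: each top-level key selects a builder, and its value is passed through as that builder's spec. A short sketch of loading it directly (the path is illustrative):

    from bsie.apps._loader import load_pipeline

    # build the default pipeline and list the principal predicates it can produce
    pipeline = load_pipeline('bsie/apps/default_config.yaml')
    for pred in pipeline.principals:
        print(pred.uri)
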
diff --git a/bsie/apps/index.py b/bsie/apps/index.py
index 1dbfdd8..d64e8c2 100644
--- a/bsie/apps/index.py
+++ b/bsie/apps/index.py
@@ -1,19 +1,15 @@
-"""
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
-"""
-# imports
+# standard imports
import argparse
import os
import typing
# bsie imports
-from bsie.base import errors
-from bsie.lib import BSIE
-from bsie.tools import builder
-from bsie.utils import bsfs
+from bsie.lib import BSIE, DefaultNamingPolicy
+from bsie.utils import bsfs, errors, node as node_
+
+# inner-module imports
+from . import _loader
# exports
__all__: typing.Sequence[str] = (
@@ -26,7 +22,12 @@ __all__: typing.Sequence[str] = (
def main(argv):
"""Index files or directories into BSFS."""
parser = argparse.ArgumentParser(description=main.__doc__, prog='index')
- parser.add_argument('--user', type=bsfs.URI, default=bsfs.URI('http://example.com/me'),
+ parser.add_argument('--config', type=str,
+ default=os.path.join(os.path.dirname(__file__), _loader.DEFAULT_CONFIG_FILE),
+ help='Path to the config file.')
+ parser.add_argument('--host', type=bsfs.URI, default=bsfs.URI('http://example.com'),
+ help='')
+ parser.add_argument('--user', type=str, default='me',
help='')
parser.add_argument('--collect', action='append', default=[],
help='')
@@ -42,35 +43,15 @@ def main(argv):
help='')
args = parser.parse_args(argv)
- # FIXME: Read reader/extractor configs from a config file
- # reader builder
- rbuild = builder.ReaderBuilder({})
- # extractor builder
- ebuild = builder.ExtractorBuilder([
- {'bsie.extractor.generic.path.Path': {}},
- {'bsie.extractor.generic.stat.Stat': {}},
- {'bsie.extractor.generic.constant.Constant': dict(
- tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')],
- schema='''
- bse:author rdfs:subClassOf bsfs:Predicate ;
- rdfs:domain bsfs:Entity ;
- rdfs:range xsd:string ;
- bsfs:unique "true"^^xsd:boolean .
- ''',
- )},
- ])
- # pipeline builder
- pbuild = builder.PipelineBuilder(
- bsfs.Namespace(args.user + ('/' if not args.user.endswith('/') else '')),
- rbuild,
- ebuild,
- )
-
# build pipeline
- pipeline = pbuild.build()
+ pipeline = _loader.load_pipeline(args.config)
+ # build the naming policy
+ naming_policy = DefaultNamingPolicy(
+ host=args.host,
+ user=args.user,
+ )
# build BSIE frontend
- bsie = BSIE(pipeline, args.collect, args.discard)
-
+ bsie = BSIE(pipeline, naming_policy, args.collect, args.discard)
def walk(handle):
"""Walk through given input files."""
@@ -78,11 +59,12 @@ def main(argv):
# FIXME: simplify code (below but maybe also above)
# FIXME: How to handle dependencies between data?
# E.g. do I still want to link to a tag despite not being permitted to set its label?
- # FIXME: node renaming?
# index input paths
for path in args.input_file:
- if os.path.isdir(path) and args.recursive:
+ if not os.path.exists(path):
+ pass # FIXME: notify the user
+ elif os.path.isdir(path) and args.recursive:
for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=args.follow):
for filename in filenames:
for node, pred, value in bsie.from_file(os.path.join(dirpath, filename)):
@@ -105,13 +87,14 @@ def main(argv):
store.migrate(bsie.schema)
# process files
def handle(node, pred, value):
+ if isinstance(value, node_.Node):
+ value = store.node(value.node_type, value.uri)
store.node(node.node_type, node.uri).set(pred.uri, value)
walk(handle)
# return store
return store
-
## main ##
if __name__ == '__main__':
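The store-backed handle() above receives one (node, predicate, value) triple at a time; values that are themselves nodes are first re-created on the store. A stand-in callback with the same signature that only prints triples, useful for a dry run (purely illustrative, not part of the app):

    def print_handle(node, pred, value):
        # same contract as handle() above, but without a BSFS store
        print(node.node_type, node.uri, pred.uri, repr(value))
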
diff --git a/bsie/apps/info.py b/bsie/apps/info.py
index eaf1f71..e27b70b 100644
--- a/bsie/apps/info.py
+++ b/bsie/apps/info.py
@@ -1,18 +1,15 @@
-"""
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
-"""
-# imports
+# standard imports
import argparse
+import os
import sys
import typing
# bsie imports
-from bsie.base import errors
-from bsie.tools import builder
-from bsie.utils import bsfs
+from bsie.utils import bsfs, errors
+
+# inner-module imports
+from . import _loader
# exports
__all__: typing.Sequence[str] = (
@@ -25,42 +22,24 @@ __all__: typing.Sequence[str] = (
def main(argv):
"""Show information from BSIE."""
parser = argparse.ArgumentParser(description=main.__doc__, prog='info')
- parser.add_argument('what', choices=('predicates', ),
+ parser.add_argument('--config', type=str,
+ default=os.path.join(os.path.dirname(__file__), _loader.DEFAULT_CONFIG_FILE),
+ help='Path to the config file.')
+ parser.add_argument('what', choices=('predicates', 'schema'),
help='Select what information to show.')
args = parser.parse_args(argv)
- # FIXME: Read reader/extractor configs from a config file
- # reader builder
- rbuild = builder.ReaderBuilder({})
- # extractor builder
- ebuild = builder.ExtractorBuilder([
- {'bsie.extractor.generic.path.Path': {}},
- {'bsie.extractor.generic.stat.Stat': {}},
- {'bsie.extractor.generic.constant.Constant': dict(
- tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')],
- schema='''
- bse:author rdfs:subClassOf bsfs:Predicate ;
- rdfs:domain bsfs:Entity ;
- rdfs:range xsd:string ;
- bsfs:unique "true"^^xsd:boolean .
- ''',
- )},
- ])
- # pipeline builder
- pbuild = builder.PipelineBuilder(
- bsfs.Namespace('http://example.com/me/'), # not actually used
- rbuild,
- ebuild,
- )
-
# build pipeline
- pipeline = pbuild.build()
+ pipeline = _loader.load_pipeline(args.config)
# show info
if args.what == 'predicates':
# show predicates
for pred in pipeline.schema.predicates():
print(pred.uri)
+ elif args.what == 'schema':
+ # show schema
+ print(bsfs.schema.to_string(pipeline.schema))
else:
# args.what is already checked by argparse
raise errors.UnreachableError()
diff --git a/bsie/base/__init__.py b/bsie/base/__init__.py
deleted file mode 100644
index 0d362cd..0000000
--- a/bsie/base/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-"""The base module defines the BSIE interfaces.
-
-You'll mostly find abstract classes here.
-
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
-"""
-# imports
-import typing
-
-# inner-module imports
-from . import errors
-from .extractor import Extractor
-from .reader import Reader
-
-# exports
-__all__: typing.Sequence[str] = (
- 'Extractor',
- 'Reader',
- 'errors',
- )
-
-## EOF ##
diff --git a/bsie/extractor/__init__.py b/bsie/extractor/__init__.py
index ef31343..36fa9ba 100644
--- a/bsie/extractor/__init__.py
+++ b/bsie/extractor/__init__.py
@@ -2,14 +2,18 @@
Each Extractor class is linked to the Reader class whose content it requires.
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import typing
+# inner-module imports
+from .base import Extractor
+from .builder import ExtractorBuilder
+
# exports
-__all__: typing.Sequence[str] = []
+__all__: typing.Sequence[str] = (
+ 'Extractor',
+ 'ExtractorBuilder',
+ )
## EOF ##
diff --git a/bsie/base/extractor.py b/bsie/extractor/base.py
index c44021b..f92d7cc 100644
--- a/bsie/base/extractor.py
+++ b/bsie/extractor/base.py
@@ -1,10 +1,6 @@
"""The Extractor classes transform content into triples.
-
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import abc
import typing
@@ -28,16 +24,32 @@ SCHEMA_PREAMBLE = '''
prefix schema: <http://schema.org/>
# common bsfs prefixes
- prefix bsfs: <http://bsfs.ai/schema/>
- prefix bse: <http://bsfs.ai/schema/Entity#>
+ prefix bsfs: <https://schema.bsfs.io/core/>
+ prefix bsl: <https://schema.bsfs.io/core/Literal/>
+ prefix bsa: <https://schema.bsfs.io/core/Literal/Array/>
+ prefix bsd: <https://schema.bsfs.io/core/distance#>
+
+ prefix bsie: <https://schema.bsfs.io/ie/>
+ prefix bsn: <https://schema.bsfs.io/ie/Node/>
+ prefix bse: <https://schema.bsfs.io/ie/Node/Entity#>
+ prefix bsp: <https://schema.bsfs.io/ie/Node/Preview#>
+
+ # default definitions
+ bsl:Array rdfs:subClassOf bsfs:Literal .
+ bsl:Number rdfs:subClassOf bsfs:Literal .
+ bsl:Time rdfs:subClassOf bsfs:Literal .
+ bsa:Feature rdfs:subClassOf bsl:Array ;
+ bsfs:dimension "1"^^xsd:integer ;
+ bsfs:dtype <https://schema.bsfs.io/core/dtype#f16> ;
+ bsfs:distance bsd:euclidean .
# essential nodes
- bsfs:Entity rdfs:subClassOf bsfs:Node .
- bsfs:File rdfs:subClassOf bsfs:Entity .
+ bsn:Entity rdfs:subClassOf bsfs:Node .
# common definitions
xsd:string rdfs:subClassOf bsfs:Literal .
- xsd:integer rdfs:subClassOf bsfs:Literal .
+ xsd:integer rdfs:subClassOf bsl:Number .
+ xsd:float rdfs:subClassOf bsl:Number .
'''
@@ -83,7 +95,7 @@ class Extractor(abc.ABC):
@property
def principals(self) -> typing.Iterator[bsfs.schema.Predicate]:
"""Return the principal predicates, i.e., relations from/to the extraction subject."""
- ent = self.schema.node(ns.bsfs.Entity)
+ ent = self.schema.node(ns.bsn.Entity)
return (
pred
for pred
@@ -99,5 +111,6 @@ class Extractor(abc.ABC):
principals: typing.Iterable[bsfs.schema.Predicate],
) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
"""Return (node, predicate, value) triples."""
+ # FIXME: type annotation could be more strict: value is Hashable
## EOF ##
diff --git a/bsie/extractor/builder.py b/bsie/extractor/builder.py
new file mode 100644
index 0000000..d691b0e
--- /dev/null
+++ b/bsie/extractor/builder.py
@@ -0,0 +1,72 @@
+
+# standard imports
+import typing
+
+# bsie imports
+from bsie.utils import bsfs, errors, safe_load, unpack_qualified_name
+
+# inner-module imports
+from . import base
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'ExtractorBuilder',
+ )
+
+
+## code ##
+
+class ExtractorBuilder():
+ """Build `bsie.base.Extractor instances.
+
+ It is permissible to build multiple instances of the same extractor
+ (typically with different arguments), hence the ExtractorBuilder
+ receives a list of build specifications. Each specification is
+ a dict with a single key (extractor's qualified name) and a dict
+ to be used as keyword arguments.
+ Example: [{'bsie.extractor.generic.path.Path': {}}, ]
+
+ """
+
+ # build specifications
+ _specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]
+
+ def __init__(self, specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]):
+ self._specs = specs
+
+ def __iter__(self) -> typing.Iterator[int]:
+ """Iterate over extractor specifications."""
+ return iter(range(len(self._specs)))
+
+ def build(self, index: int) -> base.Extractor:
+ """Return an instance of the n'th extractor (n=*index*)."""
+ # get build instructions
+ specs = self._specs[index]
+
+ # check specs structure. expecting [{name: {kwargs}}]
+ if not isinstance(specs, dict):
+ raise TypeError(f'expected a dict, found {bsfs.typename(specs)}')
+ if len(specs) != 1:
+ raise TypeError(f'expected a dict of length one, found {len(specs)}')
+
+ # get name and args from specs
+ name = next(iter(specs.keys()))
+ kwargs = specs[name]
+
+ # check kwargs structure
+ if not isinstance(kwargs, dict):
+ raise TypeError(f'expected a dict, found {bsfs.typename(kwargs)}')
+
+ # check name and get module/class components
+ module_name, class_name = unpack_qualified_name(name)
+
+ # import extractor class
+ cls = safe_load(module_name, class_name)
+
+ try: # build and return instance
+ return cls(**kwargs)
+
+ except Exception as err:
+ raise errors.BuilderError(f'failed to build extractor {name} due to {bsfs.typename(err)}: {err}') from err
+
+## EOF ##
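A usage sketch for the builder above, following the spec format given in its docstring (iterating the builder yields indices, build(i) returns the i-th extractor):

    from bsie.extractor.builder import ExtractorBuilder

    specs = [
        {'bsie.extractor.generic.path.Path': {}},
        {'bsie.extractor.generic.stat.Stat': {}},
    ]
    ebuild = ExtractorBuilder(specs)
    extractors = [ebuild.build(idx) for idx in ebuild]
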
diff --git a/bsie/extractor/generic/__init__.py b/bsie/extractor/generic/__init__.py
index 0cb7e7f..46a4bd6 100644
--- a/bsie/extractor/generic/__init__.py
+++ b/bsie/extractor/generic/__init__.py
@@ -3,11 +3,8 @@ files. Examples include file system information (file name and size, mime type,
etc.) and information that is independent of the actual file (constant triples,
host platform infos, current time, etc.).
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import typing
# exports
diff --git a/bsie/extractor/generic/constant.py b/bsie/extractor/generic/constant.py
index 11384e6..7acbe95 100644
--- a/bsie/extractor/generic/constant.py
+++ b/bsie/extractor/generic/constant.py
@@ -1,16 +1,14 @@
"""The Constant extractor produces pre-specified triples.
-
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import typing
# bsie imports
-from bsie.base import extractor
from bsie.utils import bsfs, node
+# inner-module imports
+from .. import base
+
# exports
__all__: typing.Sequence[str] = (
'Constant',
@@ -19,7 +17,7 @@ __all__: typing.Sequence[str] = (
## code ##
-class Constant(extractor.Extractor):
+class Constant(base.Extractor):
"""Extract information from file's path."""
CONTENT_READER = None
@@ -32,7 +30,7 @@ class Constant(extractor.Extractor):
schema: str,
tuples: typing.Iterable[typing.Tuple[bsfs.URI, typing.Any]],
):
- super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + schema))
+ super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + schema))
# NOTE: Raises a KeyError if the predicate is not part of the schema
self._tuples = tuple((self.schema.predicate(p_uri), value) for p_uri, value in tuples)
# TODO: use schema instance for value checking
diff --git a/bsie/extractor/generic/path.py b/bsie/extractor/generic/path.py
index 7018e12..00c1121 100644
--- a/bsie/extractor/generic/path.py
+++ b/bsie/extractor/generic/path.py
@@ -1,15 +1,10 @@
-"""
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
-"""
-# imports
+# standard imports
import os
import typing
# bsie imports
-from bsie.base import extractor
+from bsie.extractor import base
from bsie.utils import bsfs, node, ns
# exports
@@ -20,7 +15,7 @@ __all__: typing.Sequence[str] = (
## code ##
-class Path(extractor.Extractor):
+class Path(base.Extractor):
"""Extract information from file's path."""
CONTENT_READER = 'bsie.reader.path.Path'
@@ -29,13 +24,13 @@ class Path(extractor.Extractor):
_callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[str], typing.Any]]
def __init__(self):
- super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + '''
bse:filename rdfs:subClassOf bsfs:Predicate ;
- rdfs:domain bsfs:File ;
+ rdfs:domain bsn:Entity ;
rdfs:range xsd:string ;
rdfs:label "File name"^^xsd:string ;
schema:description "Filename of entity in some filesystem."^^xsd:string ;
- bsfs:unique "false"^^xsd:boolean .
+ bsfs:unique "true"^^xsd:boolean .
'''))
self._callmap = {
self.schema.predicate(ns.bse.filename): self.__filename,
diff --git a/bsie/extractor/generic/stat.py b/bsie/extractor/generic/stat.py
index 0b9ce29..92b51f3 100644
--- a/bsie/extractor/generic/stat.py
+++ b/bsie/extractor/generic/stat.py
@@ -1,17 +1,15 @@
"""Extract information from the file system, such as filesize.
-
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import os
import typing
# bsie imports
-from bsie.base import extractor
from bsie.utils import bsfs, node, ns
+# inner-module imports
+from .. import base
+
# exports
__all__: typing.Sequence[str] = (
'Stat',
@@ -20,7 +18,7 @@ __all__: typing.Sequence[str] = (
## code ##
-class Stat(extractor.Extractor):
+class Stat(base.Extractor):
"""Extract information from the file system."""
CONTENT_READER = 'bsie.reader.stat.Stat'
@@ -29,13 +27,13 @@ class Stat(extractor.Extractor):
_callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[os.stat_result], typing.Any]]
def __init__(self):
- super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + '''
bse:filesize rdfs:subClassOf bsfs:Predicate ;
- rdfs:domain bsfs:File ;
+ rdfs:domain bsn:Entity ;
rdfs:range xsd:integer ;
rdfs:label "File size"^^xsd:string ;
schema:description "File size of entity in some filesystem."^^xsd:string ;
- bsfs:unique "false"^^xsd:boolean .
+ bsfs:unique "true"^^xsd:boolean .
'''))
self._callmap = {
self.schema.predicate(ns.bse.filesize): self.__filesize,
diff --git a/bsie/extractor/image/__init__.py b/bsie/extractor/image/__init__.py
new file mode 100644
index 0000000..f82424a
--- /dev/null
+++ b/bsie/extractor/image/__init__.py
@@ -0,0 +1,8 @@
+
+# standard imports
+import typing
+
+# exports
+__all__: typing.Sequence[str] = []
+
+## EOF ##
diff --git a/bsie/extractor/image/colors_spatial.py b/bsie/extractor/image/colors_spatial.py
new file mode 100644
index 0000000..e6661a9
--- /dev/null
+++ b/bsie/extractor/image/colors_spatial.py
@@ -0,0 +1,150 @@
+"""Spatial color features.
+"""
+# standard imports
+import typing
+
+# external imports
+import PIL.Image
+import numpy as np
+
+# bsie imports
+from bsie.utils import bsfs, node, ns
+
+# inner-module imports
+from .. import base
+
+# constants
+FEATURE_NAME = ns.bsf.ColorsSpatial()
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'ColorsSpatial',
+ )
+
+
+## code ##
+
+class ColorsSpatial(base.Extractor):
+ """Determine dominant colors of subregions in the image.
+
+ Computes the dominant color of increasingly smaller subregions of the image.
+ """
+
+ CONTENT_READER = 'bsie.reader.image.Image'
+
+ # Initial subregion width.
+ width: int
+
+ # Initial subregion height.
+ height: int
+
+ # Decrement exponent.
+ exp: float
+
+ # Principal predicate's URI.
+ _predicate_name: bsfs.URI
+
+ def __init__(
+ self,
+ width: int = 32,
+ height: int = 32,
+ exp: float = 4.,
+ ):
+ # instance identifier
+ uuid = bsfs.uuid.UCID.from_dict({
+ 'width': width,
+ 'height': height,
+ 'exp': exp,
+ })
+ # determine symbol names
+ instance_name = getattr(FEATURE_NAME, uuid)
+ predicate_name = getattr(ns.bse, 'colors_spatial_' + uuid)
+ # get vector dimension
+ dimension = self.dimension(width, height, exp)
+ # initialize parent with the schema
+ super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + f'''
+ <{FEATURE_NAME}> rdfs:subClassOf bsa:Feature ;
+ # annotations
+ rdfs:label "Spatially dominant colors"^^xsd:string ;
+ schema:description "Domiant colors of subregions in an image."^^xsd:string ;
+ bsfs:distance <https://schema.bsfs.io/core/distance#euclidean> ;
+ bsfs:dtype xsd:integer .
+
+ <{instance_name}> rdfs:subClassOf <{FEATURE_NAME}> ;
+ bsfs:dimension "{dimension}"^^xsd:integer ;
+ # annotations
+ <{FEATURE_NAME}/args#width> "{width}"^^xsd:integer ;
+ <{FEATURE_NAME}/args#height> "{height}"^^xsd:integer ;
+ <{FEATURE_NAME}/args#exp> "{exp}"^^xsd:float .
+
+ <{predicate_name}> rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range <{instance_name}> ;
+ bsfs:unique "true"^^xsd:boolean .
+
+ '''))
+ # assign extra members
+ self.width = width
+ self.height = height
+ self.exp = exp
+ self._predicate_name = predicate_name
+
+ def __repr__(self) -> str:
+ return f'{bsfs.typename(self)}({self.width}, {self.height}, {self.exp})'
+
+ def __eq__(self, other: typing.Any) -> bool:
+ return super().__eq__(other) \
+ and self.width == other.width \
+ and self.height == other.height \
+ and self.exp == other.exp
+
+ def __hash__(self) -> int:
+ return hash((super().__hash__(), self.width, self.height, self.exp))
+
+ @staticmethod
+ def dimension(width: int, height: int, exp: float) -> int:
+ """Return the feature vector dimension."""
+ # FIXME: replace with a proper formula
+ dim = 0
+ while width >= 1 and height >= 1:
+ dim += width * height
+ width = np.floor(width / exp)
+ height = np.floor(height / exp)
+ dim *= 3 # per band
+ return int(dim)
+
+ def extract(
+ self,
+ subject: node.Node,
+ content: PIL.Image.Image,
+ principals: typing.Iterable[bsfs.schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+ # check principals
+ if self.schema.predicate(self._predicate_name) not in principals:
+ # nothing to do; abort
+ return
+
+ # convert to HSV
+ content = content.convert('HSV')
+
+ # get dimensions
+ width, height = self.width, self.height
+ num_bands = len(content.getbands()) # it's three since we converted to HSV before
+
+ features = []
+ while width >= 1 and height >= 1:
+ # downsample
+ img = content.resize((width, height), resample=PIL.Image.Resampling.BOX)
+ # feature vector
+ features.append(
+ np.array(img.getdata()).reshape((width * height, num_bands)))
+ # iterate
+ width = int(np.floor(width / self.exp))
+ height = int(np.floor(height / self.exp))
+
+ # combine bands and convert features to tuple
+ value = tuple(np.vstack(features).reshape(-1))
+ # return triple with feature vector as value
+ yield subject, self.schema.predicate(self._predicate_name), value
+
+## EOF ##
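With the defaults from default_config.yaml (width=32, height=32, exp=4), the loop in dimension() covers 32x32, 8x8 and 2x2 subregions over three HSV bands, i.e. 3 * (1024 + 64 + 4) = 3276 values. A small sanity check (sketch only):

    from bsie.extractor.image.colors_spatial import ColorsSpatial

    # the declared schema dimension equals the length of the emitted feature vector
    assert ColorsSpatial.dimension(32, 32, 4) == 3 * (32*32 + 8*8 + 2*2)  # == 3276
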
diff --git a/bsie/extractor/image/photometrics.py b/bsie/extractor/image/photometrics.py
new file mode 100644
index 0000000..42eb3c8
--- /dev/null
+++ b/bsie/extractor/image/photometrics.py
@@ -0,0 +1,211 @@
+
+# standard imports
+from fractions import Fraction
+import typing
+
+# bsie imports
+from bsie.utils import bsfs, node, ns
+
+# inner-module imports
+from .. import base
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Exif',
+ )
+
+
+## code ##
+
+def _gps_to_dec(coords: typing.Tuple[float, float, float]) -> float:
+ """Convert GPS coordinates from exif to float."""
+ # unpack args
+ deg, min, sec = coords # pylint: disable=redefined-builtin # min
+ # convert to float
+ deg = float(Fraction(deg))
+ min = float(Fraction(min))
+ sec = float(Fraction(sec))
+
+ if sec > 0:
+ # format is deg+min+sec
+ return (deg * 3600 + min * 60 + sec) / 3600
+ # format is deg+min
+ return deg + min / 60
+
+
+class Exif(base.Extractor):
+ """Extract information from EXIF/IPTC tags of an image file."""
+
+ CONTENT_READER = 'bsie.reader.exif.Exif'
+
+ def __init__(self):
+ super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + '''
+ #bse:t_capture rdfs:subClassOf bsfs:Predicate ;
+ # rdfs:domain bsn:Entity ;
+ # rdfs:range xsd:float ;
+ # bsfs:unique "true"^^xsd:boolean .
+ bse:exposure rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:float ;
+ bsfs:unique "true"^^xsd:boolean .
+ bse:aperture rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:float ;
+ bsfs:unique "true"^^xsd:boolean .
+ bse:iso rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "true"^^xsd:boolean .
+ bse:focal_length rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:float ;
+ bsfs:unique "true"^^xsd:boolean .
+ bse:width rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "true"^^xsd:boolean .
+ bse:height rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "true"^^xsd:boolean .
+ bse:orientation rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "true"^^xsd:boolean .
+ bse:orientation_label rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+ bse:altitude rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:float ;
+ bsfs:unique "true"^^xsd:boolean .
+ bse:latitude rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:float ;
+ bsfs:unique "true"^^xsd:boolean .
+ bse:longitude rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:float ;
+ bsfs:unique "true"^^xsd:boolean .
+ '''))
+ # initialize mapping from predicate to callback
+ self._callmap = {
+ #self.schema.predicate(ns.bse.t_capture): self._date,
+ self.schema.predicate(ns.bse.exposure): self._exposure,
+ self.schema.predicate(ns.bse.aperture): self._aperture,
+ self.schema.predicate(ns.bse.iso): self._iso,
+ self.schema.predicate(ns.bse.focal_length): self._focal_length,
+ self.schema.predicate(ns.bse.width): self._width,
+ self.schema.predicate(ns.bse.height): self._height,
+ self.schema.predicate(ns.bse.orientation): self._orientation,
+ self.schema.predicate(ns.bse.orientation_label): self._orientation_label,
+ self.schema.predicate(ns.bse.altitude): self._altitude,
+ self.schema.predicate(ns.bse.latitude): self._latitude,
+ self.schema.predicate(ns.bse.longitude): self._longitude,
+ }
+
+ def extract(
+ self,
+ subject: node.Node,
+ content: dict,
+ principals: typing.Iterable[bsfs.schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+ for pred in principals:
+ # find callback
+ clbk = self._callmap.get(pred)
+ if clbk is None:
+ continue
+ # get value
+ value = clbk(content)
+ if value is None:
+ continue
+ # produce triple
+ yield subject, pred, value
+
+ #def _date(self, content: dict): # FIXME: Return type annotation
+ # date_keys = (
+ # 'Exif.Photo.DateTimeOriginal',
+ # 'Exif.Photo.DateTimeDigitized',
+ # 'Exif.Image.DateTime',
+ # )
+ # for key in date_keys:
+ # if key in content:
+ # dt = content[key].value
+ # if dt.tzinfo is None:
+ # dt = dt.replace(tzinfo=ttime.NoTimeZone)
+ # return dt
+ # return None
+
+
+ ## photometrics
+
+ def _exposure(self, content: dict) -> typing.Optional[float]:
+ if 'Exif.Photo.ExposureTime' in content:
+ return 1.0 / float(Fraction(content['Exif.Photo.ExposureTime']))
+ return None
+
+ def _aperture(self, content: dict) -> typing.Optional[float]:
+ if 'Exif.Photo.FNumber' in content:
+ return float(Fraction(content['Exif.Photo.FNumber']))
+ return None
+
+ def _iso(self, content: dict) -> typing.Optional[int]:
+ if 'Exif.Photo.ISOSpeedRatings' in content:
+ return int(content['Exif.Photo.ISOSpeedRatings'])
+ return None
+
+ def _focal_length(self, content: dict) -> typing.Optional[float]:
+ if 'Exif.Photo.FocalLength' in content:
+ return float(Fraction(content['Exif.Photo.FocalLength']))
+ return None
+
+
+ ## image dimensions
+
+ def _width(self, content: dict) -> typing.Optional[int]:
+ # FIXME: consider orientation!
+ if 'Exif.Photo.PixelXDimension' in content:
+ return int(content['Exif.Photo.PixelXDimension'])
+ return None
+
+ def _height(self, content: dict) -> typing.Optional[int]:
+ # FIXME: consider orientation!
+ if 'Exif.Photo.PixelYDimension' in content:
+ return int(content['Exif.Photo.PixelYDimension'])
+ return None
+
+ def _orientation(self, content: dict) -> typing.Optional[int]:
+ if 'Exif.Image.Orientation' in content:
+ return int(content['Exif.Image.Orientation'])
+ return None
+
+ def _orientation_label(self, content: dict) -> typing.Optional[str]:
+ width = self._width(content)
+ height = self._height(content)
+ ori = self._orientation(content)
+ if width is not None and height is not None and ori is not None:
+ if ori <= 4:
+ return 'landscape' if width >= height else 'portrait'
+ return 'portrait' if width >= height else 'landscape'
+ return None
+
+
+ ## location
+
+ def _altitude(self, content: dict) -> typing.Optional[float]:
+ if 'Exif.GPSInfo.GPSAltitude' in content:
+ return float(Fraction(content['Exif.GPSInfo.GPSAltitude']))
+ return None
+
+ def _latitude(self, content: dict) -> typing.Optional[float]:
+ if 'Exif.GPSInfo.GPSLatitude' in content:
+ return _gps_to_dec(content['Exif.GPSInfo.GPSLatitude'].split())
+ return None
+
+ def _longitude(self, content: dict) -> typing.Optional[float]:
+ if 'Exif.GPSInfo.GPSLongitude' in content:
+ return _gps_to_dec(content['Exif.GPSInfo.GPSLongitude'].split())
+ return None
+
+## EOF ##
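A sketch of driving the Exif extractor with a hand-made content dict. The keys and values are illustrative (pyexiv2 normally supplies them), and the Node hints follow the pattern used in pipeline.py:

    from bsie.extractor.image.photometrics import Exif
    from bsie.utils import node, ns

    ext = Exif()
    subject = node.Node(ns.bsn.Entity, ucid='0123abcd')  # hypothetical content id
    content = {
        'Exif.Photo.FNumber': '28/10',        # -> bse:aperture 2.8
        'Exif.Photo.ISOSpeedRatings': '200',  # -> bse:iso 200
    }
    for subj, pred, value in ext.extract(subject, content, set(ext.principals)):
        print(pred.uri, value)
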
diff --git a/bsie/extractor/preview.py b/bsie/extractor/preview.py
new file mode 100644
index 0000000..145a01a
--- /dev/null
+++ b/bsie/extractor/preview.py
@@ -0,0 +1,96 @@
+
+# imports
+import io
+import typing
+
+# external imports
+import PIL.Image
+
+# bsie imports
+from bsie.utils import bsfs, node, ns
+
+# inner-module imports
+from . import base
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Preview',
+ )
+
+
+## code ##
+
+class Preview(base.Extractor):
+ """Extract previews."""
+
+ CONTENT_READER = 'bsie.reader.preview.Preview'
+
+ def __init__(self, max_sides: typing.Iterable[int]):
+ super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + '''
+
+
+
+ bsn:Preview rdfs:subClassOf bsfs:Node .
+ bsl:BinaryBlob rdfs:subClassOf bsfs:Literal .
+ <https://schema.bsfs.io/ie/Literal/BinaryBlob/JPEG> rdfs:subClassOf bsl:BinaryBlob .
+
+ bse:preview rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range bsn:Preview ;
+ bsfs:unique "false"^^xsd:boolean .
+
+ bsp:width rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Preview ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "true"^^xsd:boolean .
+
+ bsp:height rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Preview ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "true"^^xsd:boolean .
+
+ bsp:asset rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Preview ;
+ rdfs:range <https://schema.bsfs.io/ie/Literal/BinaryBlob/JPEG> ;
+ bsfs:unique "true"^^xsd:boolean .
+
+ '''))
+ # initialize extra args
+ self.max_sides = set(max_sides)
+
+ def __eq__(self, other: typing.Any) -> bool:
+ return super().__eq__(other) \
+ and self.max_sides == other.max_sides
+
+ def __hash__(self) -> int:
+ return hash((super().__hash__(), tuple(sorted(self.max_sides))))
+
+ def extract(
+ self,
+ subject: node.Node,
+ content: typing.Callable[[int], PIL.Image.Image],
+ principals: typing.Iterable[bsfs.schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+ # check principals
+ if self.schema.predicate(ns.bse.preview) not in principals:
+ return
+
+ for max_side in self.max_sides:
+ # get the preview in the right resolution
+ img = content(max_side)
+ # convert the preview to jpeg
+ buffer = io.BytesIO()
+ img.save(buffer, format='jpeg')
+ # create a preview node
+ preview = node.Node(ns.bsn.Preview,
+ ucid=bsfs.uuid.UCID.from_bytes(buffer.getvalue()),
+ size=max_side,
+ source=subject,
+ )
+ # yield triples
+ yield subject, self.schema.predicate(ns.bse.preview), preview
+ yield preview, self.schema.predicate(ns.bsp.width), img.width
+ yield preview, self.schema.predicate(ns.bsp.height), img.height
+ yield preview, self.schema.predicate(ns.bsp.asset), buffer.getvalue()
+
+## EOF ##
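The Preview extractor expects *content* to be a callable that maps a maximum side length to a PIL image, which is the contract of bsie.reader.preview.Preview. A sketch with a dummy callable (image content and node hints are illustrative):

    import PIL.Image
    from bsie.extractor.preview import Preview
    from bsie.utils import node, ns

    ext = Preview(max_sides=[50, 100])
    subject = node.Node(ns.bsn.Entity, ucid='0123abcd')  # hypothetical content id

    def content(max_side):
        # stand-in for the preview reader: a blank thumbnail of the requested size
        return PIL.Image.new('RGB', (max_side, max_side))

    for subj, pred, value in ext.extract(subject, content, set(ext.principals)):
        print(subj.node_type, pred.uri)
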
diff --git a/bsie/lib/__init__.py b/bsie/lib/__init__.py
index 578c2c4..f44fb74 100644
--- a/bsie/lib/__init__.py
+++ b/bsie/lib/__init__.py
@@ -1,18 +1,16 @@
-"""
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
-"""
-# imports
+# standard imports
import typing
# inner-module imports
from .bsie import BSIE
+from .builder import PipelineBuilder
+from .naming_policy import DefaultNamingPolicy
# exports
__all__: typing.Sequence[str] = (
'BSIE',
+ 'PipelineBuilder',
)
## EOF ##
diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py
index e087fa9..b02e707 100644
--- a/bsie/lib/bsie.py
+++ b/bsie/lib/bsie.py
@@ -1,16 +1,14 @@
-"""
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
-"""
-# imports
+# standard imports
import typing
# bsie imports
-from bsie.tools import Pipeline
from bsie.utils import bsfs, node, ns
+# inner-module imports
+from .naming_policy import NamingPolicy
+from .pipeline import Pipeline
+
# exports
__all__: typing.Sequence[str] = (
'BSIE',
@@ -39,15 +37,18 @@ class BSIE():
def __init__(
self,
- # pipeline builder.
+ # pipeline.
pipeline: Pipeline,
+ # naming policy
+ naming_policy: NamingPolicy,
# principals to extract at most. None implies all available w.r.t. extractors.
collect: typing.Optional[typing.Iterable[bsfs.URI]] = None,
# principals to discard.
discard: typing.Optional[typing.Iterable[bsfs.URI]] = None,
):
- # store pipeline
+ # store pipeline and naming policy
self._pipeline = pipeline
+ self._naming_policy = naming_policy
# start off with available principals
self._principals = {pred.uri for pred in self._pipeline.principals}
# limit principals to specified ones by argument.
@@ -87,6 +88,6 @@ class BSIE():
# predicate lookup
principals = {self.schema.predicate(pred) for pred in principals}
# invoke pipeline
- yield from self._pipeline(path, principals)
+ yield from self._naming_policy(self._pipeline(path, principals))
## EOF ##
diff --git a/bsie/lib/builder.py b/bsie/lib/builder.py
new file mode 100644
index 0000000..3a15311
--- /dev/null
+++ b/bsie/lib/builder.py
@@ -0,0 +1,75 @@
+
+# standard imports
+import logging
+import typing
+
+# bsie imports
+from bsie.extractor import ExtractorBuilder
+from bsie.reader import ReaderBuilder
+from bsie.utils import errors
+
+# inner-module imports
+from . import pipeline
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'PipelineBuilder',
+ )
+
+
+## code ##
+
+logger = logging.getLogger(__name__)
+
+class PipelineBuilder():
+ """Build `bsie.tools.pipeline.Pipeline` instances."""
+
+ # builder for Readers.
+ rbuild: ReaderBuilder
+
+ # builder for Extractors.
+ ebuild: ExtractorBuilder
+
+ def __init__(
+ self,
+ reader_builder: ReaderBuilder,
+ extractor_builder: ExtractorBuilder,
+ ):
+ self.rbuild = reader_builder
+ self.ebuild = extractor_builder
+
+ def build(self) -> pipeline.Pipeline:
+ """Return a Pipeline instance."""
+ ext2rdr = {}
+
+ for eidx in self.ebuild:
+ # build extractor
+ try:
+ ext = self.ebuild.build(eidx)
+
+ except errors.LoaderError as err: # failed to load extractor; skip
+ logger.error('failed to load extractor: %s', err)
+ continue
+
+ except errors.BuilderError as err: # failed to build instance; skip
+ logger.error(str(err))
+ continue
+
+ try:
+ # get reader required by extractor
+ if ext.CONTENT_READER is not None:
+ rdr = self.rbuild.build(ext.CONTENT_READER)
+ else:
+ rdr = None
+ # store extractor
+ ext2rdr[ext] = rdr
+
+ except errors.LoaderError as err: # failed to load reader
+ logger.error('failed to load reader: %s', err)
+
+ except errors.BuilderError as err: # failed to build reader
+ logger.error(str(err))
+
+ return pipeline.Pipeline(ext2rdr)
+
+## EOF ##
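The builder mirrors what _loader.load_pipeline does with the YAML config: wire a ReaderBuilder and an ExtractorBuilder together and let build() pair each extractor with its CONTENT_READER. A minimal sketch:

    from bsie.extractor import ExtractorBuilder
    from bsie.lib import PipelineBuilder
    from bsie.reader import ReaderBuilder

    pbuild = PipelineBuilder(
        ReaderBuilder({}),
        ExtractorBuilder([{'bsie.extractor.generic.path.Path': {}}]),
    )
    pipeline = pbuild.build()
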
diff --git a/bsie/lib/naming_policy.py b/bsie/lib/naming_policy.py
new file mode 100644
index 0000000..9b9a45d
--- /dev/null
+++ b/bsie/lib/naming_policy.py
@@ -0,0 +1,115 @@
+
+# standard imports
+import abc
+import os
+import typing
+
+# bsie imports
+from bsie.utils import bsfs, errors, ns
+from bsie.utils.node import Node
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'DefaultNamingPolicy',
+ )
+
+
+## code ##
+
+class NamingPolicy():
+ """Determine node uri's from node hints."""
+ def __call__(
+ self,
+ iterable: typing.Iterable[typing.Tuple[Node, bsfs.URI, typing.Any]],
+ ):
+ """Apply the policy on a triple iterator."""
+ return NamingPolicyIterator(self, iterable)
+
+ @abc.abstractmethod
+ def handle_node(self, node: Node) -> Node:
+ """Apply the policy on a node."""
+
+
+class NamingPolicyIterator():
+ """Iterates over triples, determines uris according to a *policy* as it goes."""
+
+ # source triple iterator.
+ _iterable: typing.Iterable[typing.Tuple[Node, bsfs.URI, typing.Any]]
+
+ # naming policy
+ _policy: NamingPolicy
+
+ def __init__(
+ self,
+ policy: NamingPolicy,
+ iterable: typing.Iterable[typing.Tuple[Node, bsfs.URI, typing.Any]],
+ ):
+ self._iterable = iterable
+ self._policy = policy
+
+ def __iter__(self):
+ for node, pred, value in self._iterable:
+ # handle subject
+ self._policy.handle_node(node)
+ # handle value
+ if isinstance(value, Node):
+ self._policy.handle_node(value)
+ # yield triple
+ yield node, pred, value
+
+
+class DefaultNamingPolicy(NamingPolicy):
+ """Compose URIs as <host/user/node_type#fragment>
+
+ What information is used as fragment depends on the node type.
+ Typically, the default is to use the "ucid" hint.
+ The fallback in all cases is to generate a random uuid.
+
+ Never changes previously assigned uris. Sets uris in-place.
+
+ """
+
+ def __init__(
+ self,
+ host: bsfs.URI,
+ user: str,
+ ):
+ self._prefix = bsfs.Namespace(os.path.join(host, user))
+ self._uuid = bsfs.uuid.UUID()
+
+ def handle_node(self, node: Node) -> Node:
+ if node.uri is not None:
+ return node
+ if node.node_type == ns.bsn.Entity:
+ return self.name_file(node)
+ if node.node_type == ns.bsn.Preview:
+ return self.name_preview(node)
+ raise errors.ProgrammingError(f'no naming policy available for {node.node_type}')
+
+ def name_file(self, node: Node) -> Node:
+ """Set a bsfs:File node's uri fragment to its ucid."""
+ if 'ucid' in node.hints: # content id
+ fragment = node.hints['ucid']
+ else: # random name
+ fragment = self._uuid()
+ node.uri = getattr(self._prefix.file(), fragment)
+ return node
+
+ def name_preview(self, node: Node) -> Node:
+ """Set a bsfs:Preview node's uri fragment to its ucid.
+ Uses its source fragment as fallback. Appends the size if provided.
+ """
+ fragment = None
+ if 'ucid' in node.hints: # content id
+ fragment = node.hints['ucid']
+ if fragment is None and 'source' in node.hints: # source id
+ self.handle_node(node.hints['source'])
+ fragment = node.hints['source'].uri.get('fragment', None)
+ if fragment is None: # random name
+ fragment = self._uuid()
+ if 'size' in node.hints: # append size
+ fragment += '_s' + str(node.hints['size'])
+ node.uri = getattr(self._prefix.preview(), fragment)
+ return node
+
+## EOF ##
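A sketch of the default policy in isolation. The ucid hint is hypothetical; per the class docstring the resulting URI has the form <host/user/node_type#fragment>:

    from bsie.lib import DefaultNamingPolicy
    from bsie.utils import bsfs, ns
    from bsie.utils.node import Node

    policy = DefaultNamingPolicy(host=bsfs.URI('http://example.com'), user='me')
    entity = Node(ns.bsn.Entity, ucid='0123abcd')  # uri is still unset
    policy.handle_node(entity)
    # entity.uri now carries the ucid as its fragment, composed under http://example.com/me/
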
diff --git a/bsie/tools/pipeline.py b/bsie/lib/pipeline.py
index 20e8ddf..30fd6fd 100644
--- a/bsie/tools/pipeline.py
+++ b/bsie/lib/pipeline.py
@@ -1,25 +1,19 @@
-"""
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
-"""
-# imports
+# standard imports
from collections import defaultdict
import logging
import typing
# bsie imports
-from bsie import base
-from bsie.utils import bsfs, node, ns
+from bsie.extractor import Extractor
+from bsie.reader import Reader
+from bsie.utils import bsfs, errors, node, ns
# exports
__all__: typing.Sequence[str] = (
'Pipeline',
)
-# constants
-FILE_PREFIX = 'file#'
## code ##
@@ -39,19 +33,14 @@ class Pipeline():
# combined extractor schemas.
_schema: bsfs.schema.Schema
- # node prefix.
- _prefix: bsfs.Namespace
-
# extractor -> reader mapping
- _ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]]
+ _ext2rdr: typing.Dict[Extractor, typing.Optional[Reader]]
def __init__(
self,
- prefix: bsfs.Namespace,
- ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]]
+ ext2rdr: typing.Dict[Extractor, typing.Optional[Reader]]
):
# store core members
- self._prefix = prefix + FILE_PREFIX
self._ext2rdr = ext2rdr
# compile schema from all extractors
self._schema = bsfs.schema.Schema.Union(ext.schema for ext in ext2rdr)
@@ -63,12 +52,11 @@ class Pipeline():
return f'{bsfs.typename(self)}(...)'
def __hash__(self) -> int:
- return hash((type(self), self._prefix, self._schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values())))
+ return hash((type(self), self._schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values())))
def __eq__(self, other: typing.Any) -> bool:
return isinstance(other, type(self)) \
and self._schema == other._schema \
- and self._prefix == other._prefix \
and self._ext2rdr == other._ext2rdr
@property
@@ -116,27 +104,33 @@ class Pipeline():
rdr2ext[rdr].add(ext)
# create subject for file
- uuid = bsfs.uuid.UCID.from_path(path)
- subject = node.Node(ns.bsfs.File, self._prefix[uuid])
+ subject = node.Node(ns.bsn.Entity,
+ ucid=bsfs.uuid.UCID.from_path(path),
+ )
# extract information
for rdr, extrs in rdr2ext.items():
try:
# get content
content = rdr(path) if rdr is not None else None
+ #logger.info('extracted %s from %s', rdr, path)
# apply extractors on this content
for ext in extrs:
try:
# get predicate/value tuples
- for subject, pred, value in ext.extract(subject, content, principals):
- yield subject, pred, value
+ yield from ext.extract(subject, content, principals)
- except base.errors.ExtractorError as err:
+ except errors.ExtractorError as err:
# critical extractor failure.
logger.error('%s failed to extract triples from content: %s', ext, err)
- except base.errors.ReaderError as err:
+ except errors.UnsupportedFileFormatError:
+ # failed to read the file format. skip.
+ #logger.warning('%s could not process the file format of %s', rdr, err)
+ pass
+
+ except errors.ReaderError as err:
# failed to read any content. skip.
logger.error('%s failed to read content: %s', rdr, err)
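End to end, the pipeline is callable with a path and a set of principal predicates and yields raw (node, predicate, value) triples, before any naming policy is applied. A sketch (paths illustrative):

    from bsie.apps._loader import load_pipeline

    pipeline = load_pipeline('bsie/apps/default_config.yaml')
    for subj, pred, value in pipeline('photo.jpg', set(pipeline.principals)):
        print(subj.node_type, pred.uri, value)
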
diff --git a/bsie/reader/__init__.py b/bsie/reader/__init__.py
index a45f22b..a1c38a9 100644
--- a/bsie/reader/__init__.py
+++ b/bsie/reader/__init__.py
@@ -1,8 +1,8 @@
"""The Reader classes return high-level content structures from files.
The Reader fulfills two purposes:
- First, it brokers between multiple libraries and file formats.
- Second, it separates multiple aspects of a file into distinct content types.
+First, it brokers between multiple libraries and file formats.
+Second, it separates multiple aspects of a file into distinct content types.
Often, different libraries focus on reading different types of content from a
file. E.g. one would use different modules to read file system infos than to
@@ -11,9 +11,18 @@ type. Each distinct type can be implemented in a file or submodule that
provides a Reader implementation. Through utilization of submodules, different
file formats can be supported.
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
"""
+# standard imports
+import typing
+
+# inner-module imports
+from .base import Reader
+from .builder import ReaderBuilder
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Reader',
+ 'ReaderBuilder',
+ )
## EOF ##
diff --git a/bsie/base/reader.py b/bsie/reader/base.py
index cbabd36..a775701 100644
--- a/bsie/base/reader.py
+++ b/bsie/reader/base.py
@@ -1,14 +1,5 @@
-"""The Reader classes return high-level content structures from files.
-The Reader fulfills two purposes:
- First, it brokers between multiple libraries and file formats.
- Second, it separates multiple aspects of a file into distinct content types.
-
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
-"""
-# imports
+# standard imports
import abc
import typing
@@ -39,7 +30,7 @@ class Reader(abc.ABC):
return hash(type(self))
@abc.abstractmethod
- def __call__(self, path: bsfs.URI) -> typing.Any:
+ def __call__(self, path: str) -> typing.Any:
"""Return some content of the file at *path*.
Raises a `ReaderError` if the reader cannot make sense of the file format.
"""
diff --git a/bsie/reader/builder.py b/bsie/reader/builder.py
new file mode 100644
index 0000000..d32700b
--- /dev/null
+++ b/bsie/reader/builder.py
@@ -0,0 +1,73 @@
+
+# standard imports
+import typing
+
+# bsie imports
+from bsie.utils import bsfs, errors, safe_load, unpack_qualified_name
+
+# inner-module imports
+from . import base
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'ReaderBuilder',
+ )
+
+
+## code ##
+
+class ReaderBuilder():
+ """Build `bsie.base.Reader` instances.
+
+ Readers are defined via their qualified class name
+ (e.g., bsie.reader.path.Path) and optional keyword
+ arguments that are passed to the constructor via
+ the *kwargs* argument (name as key, kwargs as value).
+ The ReaderBuilder keeps a cache of previously built
+ reader instances, since repeated requests for the same
+ name would use identical keyword arguments anyway.
+
+ """
+
+ # keyword arguments
+ _kwargs: typing.Dict[str, typing.Dict[str, typing.Any]]
+
+ # cached readers
+ _cache: typing.Dict[str, base.Reader]
+
+ def __init__(
+ self,
+ kwargs: typing.Optional[typing.Dict[str, typing.Dict[str, typing.Any]]] = None):
+ if kwargs is None:
+ kwargs = {}
+ self._kwargs = kwargs
+ self._cache = {}
+
+ def build(self, name: str) -> base.Reader:
+ """Return an instance for the qualified class name."""
+ # return cached instance
+ if name in self._cache:
+ return self._cache[name]
+
+ # check name and get module/class components
+ module_name, class_name = unpack_qualified_name(name)
+
+ # import reader class
+ cls = safe_load(module_name, class_name)
+
+ # get kwargs
+ kwargs = self._kwargs.get(name, {})
+ if not isinstance(kwargs, dict):
+ raise TypeError(f'expected a kwargs dict, found {bsfs.typename(kwargs)}')
+
+ try: # build, cache, and return instance
+ obj = cls(**kwargs)
+ # cache instance
+ self._cache[name] = obj
+ # return instance
+ return obj
+
+ except Exception as err:
+ raise errors.BuilderError(f'failed to build reader {name} due to {bsfs.typename(err)}: {err}') from err
+
+## EOF ##
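A sketch of the caching behaviour described in the docstring: constructor kwargs are keyed by qualified class name, and repeated build() calls for the same name return the same instance:

    from bsie.reader import ReaderBuilder

    rbuild = ReaderBuilder({'bsie.reader.image.Image': {}})  # per-reader kwargs, keyed by name
    first = rbuild.build('bsie.reader.path.Path')
    second = rbuild.build('bsie.reader.path.Path')
    assert first is second  # cached
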
diff --git a/bsie/reader/chain.py b/bsie/reader/chain.py
new file mode 100644
index 0000000..79b44b4
--- /dev/null
+++ b/bsie/reader/chain.py
@@ -0,0 +1,86 @@
+
+# standard imports
+import logging
+import typing
+
+# bsie imports
+from bsie.utils import bsfs, errors
+
+# inner-module imports
+from . import base
+from . import builder
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'ReaderChain',
+ )
+
+
+## code ##
+
+logger = logging.getLogger(__name__)
+
+# Content type.
+T_CONTENT = typing.TypeVar('T_CONTENT') # pylint: disable=invalid-name
+
+class ReaderChain(base.Reader, typing.Generic[T_CONTENT]):
+ """Read an image."""
+
+ # sub-readers for specific file formats.
+ _children: typing.Tuple[base.Reader, ...]
+
+ def __init__(
+ self,
+ subreader_names: typing.Iterable[str],
+ cfg: typing.Optional[typing.Any] = None,
+ ):
+ rbuild = builder.ReaderBuilder(cfg)
+ children = []
+ for name in subreader_names:
+ try:
+ # build sub-reader
+ children.append(rbuild.build(name))
+ except (ValueError,
+ TypeError,
+ errors.LoaderError,
+ errors.BuilderError) as err:
+ # failed to build a child; skip and notify
+ logger.warning('failed to load reader: %s', err)
+
+ if len(children) == 0:
+ logger.warning('%s failed to load any sub-readers.', bsfs.typename(self))
+
+ # copy children to member
+ self._children = tuple(children)
+
+ def __str__(self) -> str:
+ substr = ', '.join(str(child) for child in self._children)
+ return f'{bsfs.typename(self)}({substr})'
+
+ def __repr__(self) -> str:
+ return f'{bsfs.typename(self)}({self._children})'
+
+ def __eq__(self, other: typing.Any) -> bool:
+ return super().__eq__(other) \
+ and self._children == other._children
+
+ def __hash__(self) -> int:
+ return hash((super().__hash__(), self._children))
+
+ def __call__(self, path: str) -> T_CONTENT:
+ raise_error = False
+ for child in self._children:
+ try:
+ return child(path)
+ except errors.UnsupportedFileFormatError:
+ # child cannot read the file, skip.
+ pass
+ except errors.ReaderError:
+ # child failed to read the file, skip.
+ raise_error = True
+
+ if raise_error:
+ raise errors.ReaderError(path)
+ raise errors.UnsupportedFileFormatError(path)
+
+## EOF ##
diff --git a/bsie/reader/exif.py b/bsie/reader/exif.py
new file mode 100644
index 0000000..2d0428b
--- /dev/null
+++ b/bsie/reader/exif.py
@@ -0,0 +1,44 @@
+
+# standard imports
+import typing
+
+# external imports
+import pyexiv2
+
+# bsie imports
+from bsie.utils import errors, filematcher
+
+# inner-module imports
+from . import base
+
+# constants
+MATCH_RULE = 'mime=image/jpeg'
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Exif',
+ )
+
+
+## code ##
+
+class Exif(base.Reader):
+ """Use pyexiv2 to read exif metadata from image files."""
+
+ def __init__(self):
+ self._match = filematcher.parse(MATCH_RULE)
+
+ def __call__(self, path: str) -> dict:
+ # perform quick checks first
+ if not self._match(path):
+ raise errors.UnsupportedFileFormatError(path)
+
+ try:
+ # open the file
+ img = pyexiv2.Image(path)
+ # read metadata
+ return img.read_exif()
+ except (TypeError, OSError, RuntimeError) as err:
+ raise errors.ReaderError(path) from err
+
+## EOF ##
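Reader usage sketch (the file name is illustrative): the matcher rejects non-JPEG files with UnsupportedFileFormatError before pyexiv2 is even invoked, which is what lets ReaderChain and the pipeline skip them quietly.

    from bsie.reader.exif import Exif
    from bsie.utils import errors

    reader = Exif()
    try:
        tags = reader('photo.jpg')  # returns pyexiv2's exif dict
        print(tags.get('Exif.Photo.FNumber'))
    except errors.UnsupportedFileFormatError:
        pass  # not a JPEG; other readers in a chain may still handle it
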
diff --git a/bsie/reader/image/__init__.py b/bsie/reader/image/__init__.py
new file mode 100644
index 0000000..89642f2
--- /dev/null
+++ b/bsie/reader/image/__init__.py
@@ -0,0 +1,31 @@
+
+# standard imports
+import typing
+
+# external imports
+import PIL.Image
+
+# inner-module imports
+from .. import chain
+
+# constants
+_FILE_FORMAT_READERS: typing.Sequence[str] = (
+ __package__ + '._raw.RawImage',
+ __package__ + '._pillow.PillowImage',
+ )
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Image',
+ )
+
+
+## code ##
+
+class Image(chain.ReaderChain[PIL.Image.Image]): # pylint: disable=too-few-public-methods
+ """Read an image file."""
+
+ def __init__(self, cfg: typing.Optional[typing.Any] = None):
+ super().__init__(_FILE_FORMAT_READERS, cfg)
+
+## EOF ##
diff --git a/bsie/reader/image/_pillow.py b/bsie/reader/image/_pillow.py
new file mode 100644
index 0000000..0611d3c
--- /dev/null
+++ b/bsie/reader/image/_pillow.py
@@ -0,0 +1,34 @@
+
+# standard imports
+import typing
+
+# external imports
+import PIL.Image
+
+# bsie imports
+from bsie.utils import errors
+
+# inner-module imports
+from .. import base
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'PillowImage',
+ )
+
+
+## code ##
+
+class PillowImage(base.Reader):
+ """Use PIL to read content of a variety of image file types."""
+
+ def __call__(self, path: str) -> PIL.Image.Image:
+ try:
+ # open file with PIL
+ return PIL.Image.open(path)
+ except PIL.UnidentifiedImageError as err:
+ raise errors.UnsupportedFileFormatError(path) from err
+ except IOError as err:
+ raise errors.ReaderError(path) from err
+
+# EOF ##
diff --git a/bsie/reader/image/_raw.py b/bsie/reader/image/_raw.py
new file mode 100644
index 0000000..e5745aa
--- /dev/null
+++ b/bsie/reader/image/_raw.py
@@ -0,0 +1,56 @@
+
+# standard imports
+import typing
+
+# external imports
+import PIL.Image
+import rawpy
+
+# bsie imports
+from bsie.utils import errors, filematcher
+
+# inner-module imports
+from .. import base
+
+# constants
+MATCH_RULE = 'mime={image/x-nikon-nef} | extension={nef}'
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'RawImage',
+ )
+
+
+## code ##
+
+class RawImage(base.Reader):
+ """Use rawpy to read content of raw image file types."""
+
+ # file matcher
+ _match: filematcher.Matcher
+
+ # additional kwargs to rawpy's postprocess
+ _rawpy_kwargs: typing.Dict[str, typing.Any]
+
+ def __init__(self, **rawpy_kwargs):
+ match_rule = rawpy_kwargs.pop('file_match_rule', MATCH_RULE)
+ self._match = filematcher.parse(match_rule)
+ self._rawpy_kwargs = rawpy_kwargs
+
+ def __call__(self, path: str) -> PIL.Image.Image:
+ # perform quick checks first
+ if not self._match(path):
+ raise errors.UnsupportedFileFormatError(path)
+
+ try:
+ # open file with rawpy
+ ary = rawpy.imread(path).postprocess(**self._rawpy_kwargs)
+ # convert to PIL.Image
+ return PIL.Image.fromarray(ary)
+ except (rawpy.LibRawFatalError, # pylint: disable=no-member # pylint doesn't find the errors
+ rawpy.NotSupportedError, # pylint: disable=no-member
+ rawpy.LibRawNonFatalError, # pylint: disable=no-member
+ ) as err:
+ raise errors.ReaderError(path) from err
+
+## EOF ##
diff --git a/bsie/reader/path.py b/bsie/reader/path.py
index d60f187..45eb127 100644
--- a/bsie/reader/path.py
+++ b/bsie/reader/path.py
@@ -1,14 +1,10 @@
"""The Path reader produces a file path.
-
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import typing
-# bsie imports
-from bsie.base import reader
+# inner-module imports
+from . import base
# exports
__all__: typing.Sequence[str] = (
@@ -18,7 +14,7 @@ __all__: typing.Sequence[str] = (
## code ##
-class Path(reader.Reader):
+class Path(base.Reader):
"""Return the path."""
def __call__(self, path: str) -> str:
diff --git a/bsie/reader/preview/__init__.py b/bsie/reader/preview/__init__.py
new file mode 100644
index 0000000..791a133
--- /dev/null
+++ b/bsie/reader/preview/__init__.py
@@ -0,0 +1,34 @@
+"""Create previews from files via format-specific preview readers."""
+# standard imports
+import typing
+
+# external imports
+import PIL.Image
+
+# inner-module imports
+from .. import chain
+
+# constants
+_FILE_FORMAT_READERS: typing.Sequence[str] = (
+ # native image formats
+ __package__ + '._pillow.PillowPreviewReader',
+ __package__ + '._rawpy.RawpyPreviewReader',
+ # multiformat readers
+ __package__ + '._pg.PreviewGeneratorReader',
+ )
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Preview',
+ )
+
+
+## code ##
+
+class Preview(chain.ReaderChain[typing.Callable[[int], PIL.Image.Image]]): # pylint: disable=too-few-public-methods
+ """Create a preview from a file."""
+
+ def __init__(self, cfg: typing.Optional[typing.Any] = None):
+ super().__init__(_FILE_FORMAT_READERS, cfg)
+
+## EOF ##
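The chain yields a callback rather than a finished preview, so the caller chooses the size and pays the rendering cost only when needed; a sketch (illustrative path):

from bsie.reader.preview import Preview

reader = Preview()
make_preview = reader('/photos/example.nef')  # format-specific callback
thumb = make_preview(256)                     # PIL.Image.Image, longer side at most 256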
diff --git a/bsie/reader/preview/_pg.py b/bsie/reader/preview/_pg.py
new file mode 100644
index 0000000..401b33d
--- /dev/null
+++ b/bsie/reader/preview/_pg.py
@@ -0,0 +1,81 @@
+"""Create previews via the preview_generator library."""
+# standard imports
+from functools import partial
+import contextlib
+import io
+import os
+import shutil
+import tempfile
+import typing
+
+# external imports
+from preview_generator.manager import PreviewManager
+import PIL.Image
+
+# bsie imports
+from bsie.utils import errors
+
+# inner-module imports
+from .. import base
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'PreviewGeneratorReader',
+ )
+
+
+## code ##
+
+class PreviewGeneratorReader(base.Reader):
+ """Uses preview_generator to create previews for various data formats.
+    See https://github.com/algoo/preview-generator for details.
+ """
+
+ # PreviewManager instance.
+ _mngr: PreviewManager
+
+ # Set of mime types supported by PreviewManager.
+ _supported_mimetypes: typing.Set[str]
+
+ # PreviewManager cache.
+ _cache: str
+
+ # Determines whether the cache directory should be deleted after use.
+ _cleanup: bool
+
+ def __init__(self, cache: typing.Optional[str] = None):
+ # initialize cache directory
+ # TODO: initialize in memory, e.g., via PyFilesystem
+ if cache is None:
+ self._cache = tempfile.mkdtemp(prefix='bsie-preview-cache-')
+ self._cleanup = True
+ else:
+ self._cache = cache
+ self._cleanup = False
+ # create preview generator
+ with contextlib.redirect_stderr(io.StringIO()):
+ self._mngr = PreviewManager(self._cache, create_folder=True)
+ self._supported_mimetypes = set(self._mngr.get_supported_mimetypes())
+
+ def __del__(self):
+ if self._cleanup:
+ shutil.rmtree(self._cache, ignore_errors=True)
+
+ def __call__(self, path: str) -> typing.Callable[[int], PIL.Image.Image]:
+ if not os.path.exists(path):
+ raise errors.ReaderError(path)
+ if self._mngr.get_mimetype(path) not in self._supported_mimetypes:
+ raise errors.UnsupportedFileFormatError(path)
+ return partial(self._preview_callback, path)
+
+ def _preview_callback(self, path: str, max_side: int) -> PIL.Image.Image:
+ """Produce a jpeg preview of *path* with at most *max_side* side length."""
+ try:
+ # generate the preview
+ preview_path = self._mngr.get_jpeg_preview(path, width=max_side, height=max_side)
+ # open the preview and return
+ return PIL.Image.open(preview_path)
+ except Exception as err: # FIXME: less generic exception!
+ raise errors.ReaderError(path) from err
+
+## EOF ##
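A sketch of the two-step interface: __call__ only verifies that the mime type is supported, while the returned callback performs the actual, potentially slow, rendering (illustrative path):

from bsie.reader.preview._pg import PreviewGeneratorReader

reader = PreviewGeneratorReader()           # cache in a fresh temporary directory
callback = reader('/documents/report.pdf')  # cheap: mime-type check only
preview = callback(512)                     # expensive: renders a jpeg preview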
diff --git a/bsie/reader/preview/_pillow.py b/bsie/reader/preview/_pillow.py
new file mode 100644
index 0000000..2b797c6
--- /dev/null
+++ b/bsie/reader/preview/_pillow.py
@@ -0,0 +1,39 @@
+"""Create previews for image files via the Pillow library."""
+# standard imports
+from functools import partial
+import typing
+
+# external imports
+import PIL.Image
+
+# bsie imports
+from bsie.utils import errors
+
+# inner-module imports
+from . import utils
+from .. import base
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'PillowPreviewReader',
+ )
+
+
+## code ##
+
+class PillowPreviewReader(base.Reader):
+ """Produce previews for image files using the Pillow library."""
+
+ def __call__(self, path: str) -> typing.Callable[[int], PIL.Image.Image]:
+ try:
+ # open file with PIL
+ img = PIL.Image.open(path)
+ # return callback
+ return partial(utils.resize, img)
+ except PIL.UnidentifiedImageError as err:
+ # failed to open, skip file
+ raise errors.UnsupportedFileFormatError(path) from err
+ except OSError as err:
+ raise errors.ReaderError(path) from err
+
+## EOF ##
diff --git a/bsie/reader/preview/_rawpy.py b/bsie/reader/preview/_rawpy.py
new file mode 100644
index 0000000..16e8675
--- /dev/null
+++ b/bsie/reader/preview/_rawpy.py
@@ -0,0 +1,61 @@
+"""Create previews for raw image files via the rawpy library."""
+# standard imports
+from functools import partial
+import typing
+
+# external imports
+import PIL.Image
+import rawpy
+
+# bsie imports
+from bsie.utils import errors, filematcher
+
+# inner-module imports
+from . import utils
+from .. import base
+
+# constants
+MATCH_RULE = 'mime={image/x-nikon-nef} | extension={nef}'
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'RawpyPreviewReader',
+ )
+
+
+## code ##
+
+class RawpyPreviewReader(base.Reader):
+ """Produce previews for raw image files using the rawpy library."""
+
+ # file matcher
+ _match: filematcher.Matcher
+
+ # additional kwargs to rawpy's postprocess
+ _rawpy_kwargs: typing.Dict[str, typing.Any]
+
+ def __init__(self, **rawpy_kwargs):
+ match_rule = rawpy_kwargs.pop('file_match_rule', MATCH_RULE)
+ self._match = filematcher.parse(match_rule)
+ self._rawpy_kwargs = rawpy_kwargs
+
+ def __call__(self, path: str) -> typing.Callable[[int], PIL.Image.Image]:
+ # perform quick checks first
+ if not self._match(path):
+ raise errors.UnsupportedFileFormatError(path)
+
+ try:
+ # open file with rawpy
+ ary = rawpy.imread(path).postprocess(**self._rawpy_kwargs)
+ # convert to PIL.Image
+ img = PIL.Image.fromarray(ary)
+ # return callback
+ return partial(utils.resize, img)
+
+ except (rawpy.LibRawFatalError, # pylint: disable=no-member # pylint doesn't find the errors
+ rawpy.NotSupportedError, # pylint: disable=no-member
+ rawpy.LibRawNonFatalError, # pylint: disable=no-member
+ ) as err:
+ raise errors.ReaderError(path) from err
+
+## EOF ##
diff --git a/bsie/reader/preview/utils.py b/bsie/reader/preview/utils.py
new file mode 100644
index 0000000..82ecc31
--- /dev/null
+++ b/bsie/reader/preview/utils.py
@@ -0,0 +1,34 @@
+"""Utilities shared by the preview readers."""
+# standard imports
+import typing
+
+# external imports
+import PIL.Image
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'resize',
+ )
+
+
+## code ##
+
+def resize(
+ img: PIL.Image.Image,
+ max_size: int,
+ ) -> PIL.Image.Image:
+ """Resize an image to a given maximum side length."""
+ # determine target dimensions
+ ratio = img.width / img.height
+ if img.width > img.height:
+ width, height = max_size, round(max_size / ratio)
+ else:
+ width, height = round(ratio * max_size), max_size
+ # rescale and return
+ return img.resize(
+ (width, height),
+ resample=PIL.Image.Resampling.LANCZOS, # create high-quality image
+ reducing_gap=3.0, # optimize computation via fast size reduction
+ )
+
+## EOF ##
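For example, a 4000x3000 landscape image resized with max_size=512 keeps its 4:3 aspect ratio and becomes 512x384; the portrait case is symmetric. A quick check:

import PIL.Image

from bsie.reader.preview.utils import resize

landscape = PIL.Image.new('RGB', (4000, 3000))
portrait = PIL.Image.new('RGB', (3000, 4000))
print(resize(landscape, 512).size)  # (512, 384)
print(resize(portrait, 512).size)   # (384, 512)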
diff --git a/bsie/reader/stat.py b/bsie/reader/stat.py
index fc5fb24..f42e7fb 100644
--- a/bsie/reader/stat.py
+++ b/bsie/reader/stat.py
@@ -1,15 +1,14 @@
"""The Stat reader produces filesystem stat information.
-
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import os
import typing
# bsie imports
-from bsie.base import errors, reader
+from bsie.utils import errors
+
+# inner-module imports
+from . import base
# exports
__all__: typing.Sequence[str] = (
@@ -19,7 +18,7 @@ __all__: typing.Sequence[str] = (
## code ##
-class Stat(reader.Reader):
+class Stat(base.Reader):
"""Read and return the filesystem's stat infos."""
def __call__(self, path: str) -> os.stat_result:
diff --git a/bsie/tools/__init__.py b/bsie/tools/__init__.py
deleted file mode 100644
index 803c321..0000000
--- a/bsie/tools/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""
-
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
-"""
-# imports
-import typing
-
-# inner-module imports
-from . import builder
-from .pipeline import Pipeline
-
-# exports
-__all__: typing.Sequence[str] = (
- 'builder',
- 'Pipeline',
- )
-
-## EOF ##
diff --git a/bsie/tools/builder.py b/bsie/tools/builder.py
deleted file mode 100644
index 190d9bf..0000000
--- a/bsie/tools/builder.py
+++ /dev/null
@@ -1,226 +0,0 @@
-"""
-
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
-"""
-# imports
-import importlib
-import logging
-import typing
-
-# bsie imports
-from bsie import base
-from bsie.base import errors
-from bsie.utils import bsfs
-
-# inner-module imports
-from . import pipeline
-
-# exports
-__all__: typing.Sequence[str] = (
- 'ExtractorBuilder',
- 'PipelineBuilder',
- 'ReaderBuilder',
- )
-
-
-## code ##
-
-logger = logging.getLogger(__name__)
-
-def _safe_load(module_name: str, class_name: str):
- """Get a class from a module. Raise BuilderError if anything goes wrong."""
- try:
- # load the module
- module = importlib.import_module(module_name)
- except Exception as err:
- # cannot import module
- raise errors.LoaderError(f'cannot load module {module_name}') from err
-
- try:
- # get the class from the module
- cls = getattr(module, class_name)
- except Exception as err:
- # cannot find the class
- raise errors.LoaderError(f'cannot load class {class_name} from module {module_name}') from err
-
- return cls
-
-
-def _unpack_name(name):
- """Split a name into its module and class component (dot-separated)."""
- if not isinstance(name, str):
- raise TypeError(name)
- if '.' not in name:
- raise ValueError('name must be a qualified class name.')
- module_name, class_name = name[:name.rfind('.')], name[name.rfind('.')+1:]
- if module_name == '':
- raise ValueError('name must be a qualified class name.')
- return module_name, class_name
-
-
-class ReaderBuilder():
- """Build `bsie.base.Reader` instances.
-
- Readers are defined via their qualified class name
- (e.g., bsie.reader.path.Path) and optional keyword
- arguments that are passed to the constructor via
- the *kwargs* argument (name as key, kwargs as value).
- The ReaderBuilder keeps a cache of previously built
- reader instances, as they are anyway built with
- identical keyword arguments.
-
- """
-
- # keyword arguments
- _kwargs: typing.Dict[str, typing.Dict[str, typing.Any]]
-
- # cached readers
- _cache: typing.Dict[str, base.Reader]
-
- def __init__(self, kwargs: typing.Dict[str, typing.Dict[str, typing.Any]]):
- self._kwargs = kwargs
- self._cache = {}
-
- def build(self, name: str) -> base.Reader:
- """Return an instance for the qualified class name."""
- # return cached instance
- if name in self._cache:
- return self._cache[name]
-
- # check name and get module/class components
- module_name, class_name = _unpack_name(name)
-
- # import reader class
- cls = _safe_load(module_name, class_name)
-
- # get kwargs
- kwargs = self._kwargs.get(name, {})
- if not isinstance(kwargs, dict):
- raise TypeError(f'expected a kwargs dict, found {bsfs.typename(kwargs)}')
-
- try: # build, cache, and return instance
- obj = cls(**kwargs)
- # cache instance
- self._cache[name] = obj
- # return instance
- return obj
-
- except Exception as err:
- raise errors.BuilderError(f'failed to build reader {name} due to {bsfs.typename(err)}: {err}') from err
-
-
-class ExtractorBuilder():
- """Build `bsie.base.Extractor instances.
-
- It is permissible to build multiple instances of the same extractor
- (typically with different arguments), hence the ExtractorBuilder
- receives a list of build specifications. Each specification is
- a dict with a single key (extractor's qualified name) and a dict
- to be used as keyword arguments.
- Example: [{'bsie.extractor.generic.path.Path': {}}, ]
-
- """
-
- # build specifications
- _specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]
-
- def __init__(self, specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]):
- self._specs = specs
-
- def __iter__(self) -> typing.Iterator[int]:
- """Iterate over extractor specifications."""
- return iter(range(len(self._specs)))
-
- def build(self, index: int) -> base.Extractor:
- """Return an instance of the n'th extractor (n=*index*)."""
- # get build instructions
- specs = self._specs[index]
-
- # check specs structure. expecting[{name: {kwargs}}]
- if not isinstance(specs, dict):
- raise TypeError(f'expected a dict, found {bsfs.typename(specs)}')
- if len(specs) != 1:
- raise TypeError(f'expected a dict of length one, found {len(specs)}')
-
- # get name and args from specs
- name = next(iter(specs.keys()))
- kwargs = specs[name]
-
- # check kwargs structure
- if not isinstance(kwargs, dict):
- raise TypeError(f'expected a dict, found {bsfs.typename(kwargs)}')
-
- # check name and get module/class components
- module_name, class_name = _unpack_name(name)
-
- # import extractor class
- cls = _safe_load(module_name, class_name)
-
- try: # build and return instance
- return cls(**kwargs)
-
- except Exception as err:
- raise errors.BuilderError(f'failed to build extractor {name} due to {bsfs.typename(err)}: {err}') from err
-
-
-class PipelineBuilder():
- """Build `bsie.tools.pipeline.Pipeline` instances."""
-
- # Prefix to be used in the Pipeline.
- prefix: bsfs.Namespace
-
- # builder for Readers.
- rbuild: ReaderBuilder
-
- # builder for Extractors.
- ebuild: ExtractorBuilder
-
- def __init__(
- self,
- prefix: bsfs.Namespace,
- reader_builder: ReaderBuilder,
- extractor_builder: ExtractorBuilder,
- ):
- self.prefix = prefix
- self.rbuild = reader_builder
- self.ebuild = extractor_builder
-
- def build(self) -> pipeline.Pipeline:
- """Return a Pipeline instance."""
- ext2rdr = {}
-
- for eidx in self.ebuild:
- # build extractor
- try:
- ext = self.ebuild.build(eidx)
-
- except errors.LoaderError as err: # failed to load extractor; skip
- logger.error('failed to load extractor: %s', err)
- continue
-
- except errors.BuilderError as err: # failed to build instance; skip
- logger.error(str(err))
- continue
-
- try:
- # get reader required by extractor
- if ext.CONTENT_READER is not None:
- rdr = self.rbuild.build(ext.CONTENT_READER)
- else:
- rdr = None
- # store extractor
- ext2rdr[ext] = rdr
-
- except errors.LoaderError as err: # failed to load reader
- logger.error('failed to load reader: %s', err)
-
- except errors.BuilderError as err: # failed to build reader
- logger.error(str(err))
-
- return pipeline.Pipeline(self.prefix, ext2rdr)
-
-
-
-## EOF ##
diff --git a/bsie/utils/__init__.py b/bsie/utils/__init__.py
index bd22236..18c8db7 100644
--- a/bsie/utils/__init__.py
+++ b/bsie/utils/__init__.py
@@ -1,22 +1,23 @@
"""Common tools and definitions.
-
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import typing
# inner-module imports
from . import bsfs
+from . import filematcher
from . import namespaces as ns
from . import node
+from .loading import safe_load, unpack_qualified_name
# exports
__all__: typing.Sequence[str] = (
'bsfs',
+ 'filematcher',
'node',
'ns',
+ 'safe_load',
+ 'unpack_qualified_name',
)
## EOF ##
diff --git a/bsie/utils/bsfs.py b/bsie/utils/bsfs.py
index 0b88479..fc045cc 100644
--- a/bsie/utils/bsfs.py
+++ b/bsie/utils/bsfs.py
@@ -1,10 +1,6 @@
"""BSFS bridge, provides BSFS bindings for BSIE.
-
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import typing
# bsfs imports
diff --git a/bsie/base/errors.py b/bsie/utils/errors.py
index dc3c30e..7c7e6ed 100644
--- a/bsie/base/errors.py
+++ b/bsie/utils/errors.py
@@ -1,10 +1,6 @@
"""Common BSIE exceptions.
-
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import typing
# exports
@@ -39,4 +35,10 @@ class ProgrammingError(_BSIEError):
class UnreachableError(ProgrammingError):
"""Bravo, you've reached a point in code that should logically not be reachable."""
+class ParserError(_BSIEError):
+ """Failed to parse due to invalid syntax or structures."""
+
+class UnsupportedFileFormatError(_BSIEError):
+ """Failed to read a file format."""
+
## EOF ##
diff --git a/bsie/utils/filematcher/__init__.py b/bsie/utils/filematcher/__init__.py
new file mode 100644
index 0000000..908de78
--- /dev/null
+++ b/bsie/utils/filematcher/__init__.py
@@ -0,0 +1,15 @@
+"""File matching rules: a Matcher node hierarchy and a rule-string parser."""
+# standard imports
+import typing
+
+# inner-module imports
+from .matcher import Matcher
+from .parser import parse
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Matcher',
+ 'parse',
+ )
+
+## EOF ##
diff --git a/bsie/utils/filematcher/matcher.py b/bsie/utils/filematcher/matcher.py
new file mode 100644
index 0000000..1fa308e
--- /dev/null
+++ b/bsie/utils/filematcher/matcher.py
@@ -0,0 +1,174 @@
+"""Matcher nodes that test file paths against criteria."""
+# standard imports
+from collections.abc import Callable, Collection, Hashable
+import abc
+import os
+import typing
+
+# external imports
+import magic
+
+# exports
+__all__: typing.Sequence[str] = []
+
+
+## code ##
+
+# abstract nodes
+
+class Matcher(abc.ABC, Hashable, Callable, Collection): # type: ignore [misc] # Invalid base class Callable
+ """Matcher node base class."""
+
+ # child expressions or terminals
+ _childs: typing.Set[typing.Any]
+
+ def __init__(self, *childs: typing.Any):
+ if len(childs) == 1 and isinstance(childs[0], (list, tuple, set)):
+ self._childs = set(childs[0])
+ else:
+ self._childs = set(childs)
+
+ def __contains__(self, needle: typing.Any) -> bool:
+ return needle in self._childs
+
+ def __iter__(self) -> typing.Iterator[typing.Any]:
+ return iter(self._childs)
+
+ def __len__(self) -> int:
+ return len(self._childs)
+
+ def __repr__(self) -> str:
+ return f'{type(self).__name__}({self._childs})'
+
+ def __hash__(self) -> int:
+        return hash((type(self), frozenset(self._childs)))
+
+ def __eq__(self, other: typing.Any) -> bool:
+ return isinstance(other, type(self)) \
+ and self._childs == other._childs
+
+ @abc.abstractmethod
+ def __call__(self, path: str) -> bool: # pylint: disable=arguments-differ
+ """Check if *path* satisfies the conditions set by the Matcher instance."""
+
+class NOT(Matcher):
+ """Invert a matcher result."""
+ def __init__(self, expr: Matcher):
+ super().__init__(expr)
+ def __call__(self, path: str) -> bool:
+ return not next(iter(self._childs))(path)
+
+# aggregate nodes
+
+class Aggregate(Matcher): # pylint: disable=too-few-public-methods # Yeah, it's an interface...
+ """Aggregation function base class (And, Or)."""
+
+class And(Aggregate):
+ """Accept only if all conditions are satisfied."""
+ def __call__(self, path: str) -> bool:
+ for itm in self:
+ if not itm(path):
+ return False
+ return True
+
+class Or(Aggregate):
+ """Accept only if at least one condition is satisfied."""
+ def __call__(self, path: str) -> bool:
+ for itm in self:
+ if itm(path):
+ return True
+ return False
+
+
+# criteria nodes
+
+class Criterion(Matcher):
+ """Criterion base class. Limits acceptance to certain values."""
+ def accepted(self) -> typing.Set[typing.Any]:
+ """Return a set of accepted values."""
+ return self._childs
+
+# criteria w/o value (valueless)
+
+class Any(Criterion):
+ """Accepts anything."""
+ def __call__(self, path: str) -> bool:
+ return True
+
+class Nothing(Criterion):
+ """Accepts nothing."""
+ def __call__(self, path: str) -> bool:
+ return False
+
+class Exists(Criterion):
+ """Filters by existence."""
+ def __call__(self, path: str) -> bool:
+ return os.path.exists(path)
+
+class IsFile(Criterion):
+ """Checks if the path is a regular file."""
+ def __call__(self, path: str) -> bool:
+ return os.path.isfile(path)
+
+class IsDir(Criterion):
+ """Checks if the path is a directory."""
+ def __call__(self, path: str) -> bool:
+ return os.path.isdir(path)
+
+class IsLink(Criterion):
+ """Checks if the path is a link."""
+ def __call__(self, path: str) -> bool:
+ return os.path.islink(path)
+
+class IsAbs(Criterion):
+ """Checks if the path is an absolute path."""
+ def __call__(self, path: str) -> bool:
+ return os.path.isabs(path)
+
+class IsRel(Criterion):
+ """Checks if the path is a relative path."""
+ def __call__(self, path: str) -> bool:
+ return not os.path.isabs(path)
+
+class IsMount(Criterion):
+ """Checks if the path is a mount point."""
+ def __call__(self, path: str) -> bool:
+ return os.path.ismount(path)
+
+class IsEmpty(Criterion):
+ """Checks if the path is an empty file."""
+ def __call__(self, path: str) -> bool:
+ return os.path.exists(path) and os.stat(path).st_size == 0
+
+class IsReadable(Criterion):
+ """Checks if the path is readable."""
+ def __call__(self, path: str) -> bool:
+ return os.path.exists(path) and os.access(path, os.R_OK)
+
+class IsWritable(Criterion):
+ """Checks if the path is writable."""
+ def __call__(self, path: str) -> bool:
+ return os.path.exists(path) and os.access(path, os.W_OK)
+
+class IsExecutable(Criterion):
+ """Checks if the path is executable."""
+ def __call__(self, path: str) -> bool:
+ return os.path.exists(path) and os.access(path, os.X_OK)
+
+# criteria w/ value
+
+class Extension(Criterion):
+ """Filters by file extension (without the dot)."""
+ def __call__(self, path: str) -> bool:
+ _, ext = os.path.splitext(path)
+ return ext[1:] in self.accepted()
+
+class Mime(Criterion):
+ """Filters by mime type."""
+ def __call__(self, path: str) -> bool:
+ try:
+ return magic.from_file(path, mime=True).lower() in self.accepted()
+ except FileNotFoundError:
+ return False
+
+## EOF ##
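The nodes above can also be composed directly, without going through the parser; a minimal sketch:

from bsie.utils.filematcher import matcher

# readable regular files with a jpg or jpeg extension
accept = matcher.And(
    matcher.IsFile(),
    matcher.IsReadable(),
    matcher.Extension('jpg', 'jpeg'),
    )
print(accept('/tmp/example.jpg'))  # True only if such a file actually exists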
diff --git a/bsie/utils/filematcher/parser.py b/bsie/utils/filematcher/parser.py
new file mode 100644
index 0000000..dc28a0d
--- /dev/null
+++ b/bsie/utils/filematcher/parser.py
@@ -0,0 +1,141 @@
+"""Parse file matching rule strings into Matcher instances."""
+# standard imports
+import typing
+
+# external imports
+import pyparsing
+from pyparsing import printables, alphas8bit, punc8bit, QuotedString, Word, \
+ delimitedList, Or, CaselessKeyword, Group, oneOf, Optional
+
+# inner-module imports
+from . import matcher
+from .. import errors
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'parse',
+ )
+
+
+## code ##
+
+class FileMatcherParser():
+ """
+ EXPR := RULES | RULES "|" RULES
+ RULESET := RULE | RULE, RULE
+ RULE := CRITERION OP VALUE | CRITERION OP {VALUES} | VALUELESS
+ OP := != | =
+ VALUES := VALUE | VALUE, VALUE
+ VALUE := [word]
+ CRITERION := mime | extension | ...
+ """
+
+ # criteria matcher nodes w/ arguments
+ _CRITERIA: typing.Dict[str, typing.Type[matcher.Matcher]] = {
+ 'extension': matcher.Extension,
+ 'mime': matcher.Mime,
+ }
+
+ # criteria matcher nodes w/o arguments
+ _VALUELESS: typing.Dict[str, typing.Type[matcher.Matcher]] = {
+ 'any': matcher.Any,
+ 'nothing': matcher.Nothing,
+ 'exists': matcher.Exists,
+ 'isfile': matcher.IsFile,
+ 'isdir': matcher.IsDir,
+ 'islink': matcher.IsLink,
+ 'isabs': matcher.IsAbs,
+ 'isrel': matcher.IsRel,
+ 'ismount': matcher.IsMount,
+        'empty': matcher.IsEmpty,
+ 'readable': matcher.IsReadable,
+ 'writable': matcher.IsWritable,
+ 'executable': matcher.IsExecutable,
+ }
+
+ # pyparsing parser instance.
+ _parser: pyparsing.ParseExpression
+
+ def __init__(self):
+ # build the parser
+ # VALUE := [word]
+ alphabet = (printables + alphas8bit + punc8bit).translate(str.maketrans('', '', ',{}|='))
+ value = QuotedString(quoteChar='"', escChar='\\') ^ Word(alphabet)
+ # CRITERION := mime | extension | ...
+ criterion = Or([CaselessKeyword(p) for p in self._CRITERIA]).setResultsName('criterion')
+ valueless = Or([CaselessKeyword(p) for p in self._VALUELESS]).setResultsName('criterion')
+ # VALUES := VALUE | VALUE, VALUE
+ values = delimitedList(value, delim=',').setResultsName('value')
+ # OP := '=' | '!='
+ eqop = oneOf('= !=').setResultsName('op')
+ # RULE := CRITERION OP VALUE | CRITERION OP {VALUES} | VALUELESS
+ rule_none = Group(Optional('!').setResultsName('op') + valueless).setResultsName('rule_none')
+ rule_one = Group(criterion + eqop + value.setResultsName('value')).setResultsName('rule_one')
+ rule_few = Group(criterion + eqop + '{' + values + '}').setResultsName('rule_few')
+ # RULESET := RULE | RULE, RULE
+ ruleset = Group(delimitedList(rule_none ^ rule_one ^ rule_few, delim=','))
+ # EXPR := RULESET | RULESET \| RULESET
+ self._parser = delimitedList(ruleset, delim='|')
+
+ def parse(self, query: str) -> matcher.Matcher: # pylint: disable=too-many-branches
+ """Build a file matcher from a rule definition."""
+ # preprocess the query
+ query = query.strip()
+
+ # empty query
+ if len(query) == 0:
+ return matcher.Any()
+
+ try:
+ parsed = self._parser.parseString(query, parseAll=True)
+ except pyparsing.ParseException as err:
+            raise errors.ParserError(f'cannot parse query: {err}') from err
+
+ # convert to Matcher
+ rules = []
+ for exp in parsed:
+ tokens = []
+ for rule in exp:
+ # fetch accepted values
+ if rule.getName() == 'rule_none':
+ accepted = []
+ elif rule.getName() == 'rule_one':
+ accepted = [rule.value]
+ elif rule.getName() == 'rule_few':
+ accepted = list(rule.value)
+ else: # prevented by grammar
+ raise errors.UnreachableError('Invalid rule definition')
+
+ # build criterion
+ if rule.criterion in self._VALUELESS:
+ cls = self._VALUELESS[rule.criterion]
+ if rule.op == '!':
+ tokens.append(matcher.NOT(cls()))
+ else:
+ tokens.append(cls())
+ elif rule.criterion in self._CRITERIA:
+ cls = self._CRITERIA[rule.criterion]
+ if rule.op == '!=':
+ tokens.append(matcher.NOT(cls(accepted)))
+ else:
+ tokens.append(cls(accepted))
+ else: # prevented by grammar
+ raise errors.UnreachableError(f'Invalid condition "{rule.criterion}"')
+
+ # And-aggregate rules in one ruleset (if needed)
+ tokens = matcher.And(tokens) if len(tokens) > 1 else tokens[0]
+ rules.append(tokens)
+
+ # Or-aggregate rulesets
+ expr = matcher.Or(rules) if len(rules) > 1 else rules[0]
+
+ return expr
+
+# build default instance
+file_match_parser = FileMatcherParser()
+
+def parse(query: str) -> matcher.Matcher:
+ """Shortcut for FileMatcherParser()(query)."""
+ return file_match_parser.parse(query)
+
+## EOF ##
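For illustration, a rule string combining both ruleset forms of the grammar (the resulting Matcher tree shape is an implementation detail):

from bsie.utils import filematcher

# NEF raw files by mime type, or any readable png/jpg file
match = filematcher.parse('mime={image/x-nikon-nef} | readable, extension={png,jpg}')
print(match('photo.nef'))  # True if libmagic reports image/x-nikon-nef
print(match('image.png'))  # True if the file exists and is readable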
diff --git a/bsie/utils/loading.py b/bsie/utils/loading.py
new file mode 100644
index 0000000..58202d1
--- /dev/null
+++ b/bsie/utils/loading.py
@@ -0,0 +1,49 @@
+"""Helpers to load classes from qualified names."""
+# standard imports
+import importlib
+import typing
+
+# inner-module imports
+from . import errors
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'safe_load',
+ 'unpack_qualified_name',
+ )
+
+
+## code ##
+
+def safe_load(module_name: str, class_name: str):
+ """Get a class from a module. Raise BuilderError if anything goes wrong."""
+ try:
+ # load the module
+ module = importlib.import_module(module_name)
+ except Exception as err:
+ # cannot import module
+ raise errors.LoaderError(f'cannot load module {module_name} ({err})') from err
+
+ try:
+ # get the class from the module
+ cls = getattr(module, class_name)
+ except Exception as err:
+ # cannot find the class
+ raise errors.LoaderError(f'cannot load class {class_name} from module {module_name} ({err})') from err
+
+ return cls
+
+
+def unpack_qualified_name(name):
+ """Split a name into its module and class component (dot-separated)."""
+ if not isinstance(name, str):
+ raise TypeError(name)
+ if '.' not in name:
+ raise ValueError('name must be a qualified class name.')
+ module_name, class_name = name[:name.rfind('.')], name[name.rfind('.')+1:]
+ if module_name == '':
+ raise ValueError('name must be a qualified class name.')
+ return module_name, class_name
+
+
+## EOF ##
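A short usage sketch (the qualified name refers to the Path reader that ships with bsie):

from bsie.utils.loading import safe_load, unpack_qualified_name

module_name, class_name = unpack_qualified_name('bsie.reader.path.Path')
cls = safe_load(module_name, class_name)  # raises LoaderError on failure
reader = cls()                            # equivalent to bsie.reader.path.Path()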
diff --git a/bsie/utils/namespaces.py b/bsie/utils/namespaces.py
index a29fc1b..4a66048 100644
--- a/bsie/utils/namespaces.py
+++ b/bsie/utils/namespaces.py
@@ -1,26 +1,37 @@
"""Default namespaces used throughout BSIE.
-
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import typing
# inner-module imports
from . import bsfs as _bsfs
-# constants
-bse = _bsfs.Namespace('http://bsfs.ai/schema/Entity')
-bsfs = _bsfs.Namespace('http://bsfs.ai/schema', fsep='/')
-bsm = _bsfs.Namespace('http://bsfs.ai/schema/Meta')
-xsd = _bsfs.Namespace('http://www.w3.org/2001/XMLSchema')
+# generic namespaces
+xsd = _bsfs.Namespace('http://www.w3.org/2001/XMLSchema')()
+
+# core bsfs/bsie namespaces
+bsfs = _bsfs.Namespace('https://schema.bsfs.io/core')
+bsie = _bsfs.Namespace('https://schema.bsfs.io/ie')
+
+# auxiliary namespaces
+bsd = bsie.distance()
+bse = bsie.Node.Entity()
+bsf = bsie.Literal.Array.Feature
+bsl = bsfs.Literal
+bsn = bsie.Node
+bsp = bsie.Node.Preview()
# export
__all__: typing.Sequence[str] = (
+ 'bsd',
'bse',
+ 'bsf',
'bsfs',
- 'bsm',
+ 'bsie',
+    'bsl',
+ 'bsn',
+ 'bsp',
'xsd',
)
diff --git a/bsie/utils/node.py b/bsie/utils/node.py
index ecf39cd..fa34b2e 100644
--- a/bsie/utils/node.py
+++ b/bsie/utils/node.py
@@ -1,10 +1,6 @@
"""Lighweight Node to bridge to BSFS.
-
-Part of the bsie module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import typing
# bsie imports
@@ -19,30 +15,47 @@ __all__: typing.Sequence[str] = (
## code ##
class Node():
- """Lightweight Node, disconnected from any bsfs structures."""
+ """Lightweight Node, disconnected from any bsfs structures.
+
+ In most cases, provide *hints* and leave setting the uri to a node
+    naming policy. Only provide a *uri* if it is already fully determined.
+
+ """
# node type.
node_type: bsfs.URI
# node URI.
- uri: bsfs.URI
+ uri: typing.Optional[bsfs.URI]
+
+ # node naming hints.
+    hints: dict
def __init__(
self,
node_type: bsfs.URI,
- uri: bsfs.URI,
+ uri: typing.Optional[bsfs.URI] = None,
+ **uri_hints,
):
# assign members
self.node_type = bsfs.URI(node_type)
- self.uri = bsfs.URI(uri)
+ self.hints = uri_hints
+        self.uri = bsfs.URI(uri) if uri is not None else None
def __eq__(self, other: typing.Any) -> bool:
+ """Compare two Node instances based on type and uri.
+        Compares hints only if the uri is not yet specified.
+ """
return isinstance(other, Node) \
and other.node_type == self.node_type \
- and other.uri == self.uri
+ and other.uri == self.uri \
+ and (self.uri is not None or self.hints == other.hints)
def __hash__(self) -> int:
- return hash((type(self), self.node_type, self.uri))
+ identifier = self.uri
+ if identifier is None:
+ identifier = tuple((key, self.hints[key]) for key in sorted(self.hints))
+ return hash((type(self), self.node_type, identifier))
def __str__(self) -> str:
return f'{bsfs.typename(self)}({self.node_type}, {self.uri})'