Diffstat (limited to 'bsie'): 49 files changed, 1998 insertions, 515 deletions
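This commit replaces the hard-coded reader/extractor setup in the apps with a YAML-driven configuration (bsie/apps/default_config.yaml) loaded through the new bsie.apps._loader.load_pipeline(). A minimal usage sketch of that flow, assuming the bundled default config and the module layout exactly as added below; it mirrors the steps `bsie info predicates` performs rather than a separate, documented interface:

    import os
    from bsie.apps import _loader

    # resolve the default config that ships next to the apps package
    config = os.path.join(os.path.dirname(_loader.__file__), _loader.DEFAULT_CONFIG_FILE)
    # build a Pipeline from the ReaderBuilder/ExtractorBuilder sections of the config
    pipeline = _loader.load_pipeline(config)
    # list the predicates the pipeline can produce
    for pred in pipeline.schema.predicates():
        print(pred.uri)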
diff --git a/bsie/__init__.py b/bsie/__init__.py index 8d2308c..f6f2ff2 100644 --- a/bsie/__init__.py +++ b/bsie/__init__.py @@ -1,10 +1,6 @@ """The BSIE module extracts triples from files for insertion into a BSFS storage. - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import collections import typing diff --git a/bsie/apps/__init__.py b/bsie/apps/__init__.py index a548c3c..2fe4795 100644 --- a/bsie/apps/__init__.py +++ b/bsie/apps/__init__.py @@ -1,12 +1,13 @@ +#!/usr/bin/env python3 +"""BSIE tools. """ - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports +# standard imports +import argparse import typing +# bsie imports +import bsie + # inner-module imports from .index import main as index from .info import main as info @@ -15,6 +16,39 @@ from .info import main as info __all__: typing.Sequence[str] = ( 'index', 'info', + 'main', ) +# config +apps = { + 'index' : index, + 'info' : info, + } + + +## code ## + +def main(argv=None): + """Black Star File System maintenance tools.""" + parser = argparse.ArgumentParser(description=main.__doc__, prog='bsie') + # version + parser.add_argument('--version', action='version', + version='%(prog)s version {}.{}.{}'.format(*bsie.version_info)) # pylint: disable=C0209 + # application selection + parser.add_argument('app', choices=apps.keys(), + help='Select the application to run.') + # dangling args + parser.add_argument('rest', nargs=argparse.REMAINDER) + # parse + args = parser.parse_args(argv) + # run application + apps[args.app](args.rest) + + +## main ## + +if __name__ == '__main__': + import sys + main(sys.argv[1:]) + ## EOF ## diff --git a/bsie/apps/_loader.py b/bsie/apps/_loader.py new file mode 100644 index 0000000..6411f10 --- /dev/null +++ b/bsie/apps/_loader.py @@ -0,0 +1,47 @@ + +# standard imports +import typing + +# external imports +import yaml + +# bsie imports +from bsie.extractor import ExtractorBuilder +from bsie.lib import PipelineBuilder +from bsie.lib.pipeline import Pipeline +from bsie.reader import ReaderBuilder + +# constants +DEFAULT_CONFIG_FILE = 'default_config.yaml' + +# exports +__all__: typing.Sequence[str] = ( + 'DEFAULT_CONFIG_FILE', + 'load_pipeline', + ) + + +## code ## + +def load_pipeline(path: str) -> Pipeline: + """Load a pipeline according to a config at *path*.""" + # load config file + with open(path, 'rt', encoding='utf-8') as ifile: + cfg = yaml.safe_load(ifile) + + # reader builder + rbuild = ReaderBuilder(cfg['ReaderBuilder']) + # extractor builder + ebuild = ExtractorBuilder(cfg['ExtractorBuilder']) + # pipeline builder + pbuild = PipelineBuilder( + rbuild, + ebuild, + ) + # build pipeline + pipeline = pbuild.build() + + # return pipeline + return pipeline + +## EOF ## diff --git a/bsie/apps/default_config.yaml b/bsie/apps/default_config.yaml new file mode 100644 index 0000000..a59b0f3 --- /dev/null +++ b/bsie/apps/default_config.yaml @@ -0,0 +1,19 @@ + +ReaderBuilder: {} + +ExtractorBuilder: + + - bsie.extractor.preview.Preview: + max_sides: [50, 100, 200,400] + + - bsie.extractor.generic.path.Path: {} + + - bsie.extractor.generic.stat.Stat: {} + + - bsie.extractor.image.colors_spatial.ColorsSpatial: + width: 32 + height: 32 + exp: 4 + + - bsie.extractor.image.photometrics.Exif: {} + diff --git a/bsie/apps/index.py b/bsie/apps/index.py index 1dbfdd8..d64e8c2 100644 --- a/bsie/apps/index.py +++ b/bsie/apps/index.py @@ 
-1,19 +1,15 @@ -""" -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports +# standard imports import argparse import os import typing # bsie imports -from bsie.base import errors -from bsie.lib import BSIE -from bsie.tools import builder -from bsie.utils import bsfs +from bsie.lib import BSIE, DefaultNamingPolicy +from bsie.utils import bsfs, errors, node as node_ + +# inner-module imports +from . import _loader # exports __all__: typing.Sequence[str] = ( @@ -26,7 +22,12 @@ __all__: typing.Sequence[str] = ( def main(argv): """Index files or directories into BSFS.""" parser = argparse.ArgumentParser(description=main.__doc__, prog='index') - parser.add_argument('--user', type=bsfs.URI, default=bsfs.URI('http://example.com/me'), + parser.add_argument('--config', type=str, + default=os.path.join(os.path.dirname(__file__), _loader.DEFAULT_CONFIG_FILE), + help='Path to the config file.') + parser.add_argument('--host', type=bsfs.URI, default=bsfs.URI('http://example.com'), + help='') + parser.add_argument('--user', type=str, default='me', help='') parser.add_argument('--collect', action='append', default=[], help='') @@ -42,35 +43,15 @@ def main(argv): help='') args = parser.parse_args(argv) - # FIXME: Read reader/extractor configs from a config file - # reader builder - rbuild = builder.ReaderBuilder({}) - # extractor builder - ebuild = builder.ExtractorBuilder([ - {'bsie.extractor.generic.path.Path': {}}, - {'bsie.extractor.generic.stat.Stat': {}}, - {'bsie.extractor.generic.constant.Constant': dict( - tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')], - schema=''' - bse:author rdfs:subClassOf bsfs:Predicate ; - rdfs:domain bsfs:Entity ; - rdfs:range xsd:string ; - bsfs:unique "true"^^xsd:boolean . - ''', - )}, - ]) - # pipeline builder - pbuild = builder.PipelineBuilder( - bsfs.Namespace(args.user + ('/' if not args.user.endswith('/') else '')), - rbuild, - ebuild, - ) - # build pipeline - pipeline = pbuild.build() + pipeline = _loader.load_pipeline(args.config) + # build the naming policy + naming_policy = DefaultNamingPolicy( + host=args.host, + user=args.user, + ) # build BSIE frontend - bsie = BSIE(pipeline, args.collect, args.discard) - + bsie = BSIE(pipeline, naming_policy, args.collect, args.discard) def walk(handle): """Walk through given input files.""" @@ -78,11 +59,12 @@ def main(argv): # FIXME: simplify code (below but maybe also above) # FIXME: How to handle dependencies between data? # E.g. do I still want to link to a tag despite not being permitted to set its label? - # FIXME: node renaming? # index input paths for path in args.input_file: - if os.path.isdir(path) and args.recursive: + if not os.path.exists(path): + pass # FIXME: notify the user + elif os.path.isdir(path) and args.recursive: for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=args.follow): for filename in filenames: for node, pred, value in bsie.from_file(os.path.join(dirpath, filename)): @@ -105,13 +87,14 @@ def main(argv): store.migrate(bsie.schema) # process files def handle(node, pred, value): + if isinstance(value, node_.Node): + value = store.node(value.node_type, value.uri) store.node(node.node_type, node.uri).set(pred.uri, value) walk(handle) # return store return store - ## main ## if __name__ == '__main__': diff --git a/bsie/apps/info.py b/bsie/apps/info.py index eaf1f71..e27b70b 100644 --- a/bsie/apps/info.py +++ b/bsie/apps/info.py @@ -1,18 +1,15 @@ -""" -Part of the bsie module. 
-A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports +# standard imports import argparse +import os import sys import typing # bsie imports -from bsie.base import errors -from bsie.tools import builder -from bsie.utils import bsfs +from bsie.utils import bsfs, errors + +# inner-module imports +from . import _loader # exports __all__: typing.Sequence[str] = ( @@ -25,42 +22,24 @@ __all__: typing.Sequence[str] = ( def main(argv): """Show information from BSIE.""" parser = argparse.ArgumentParser(description=main.__doc__, prog='info') - parser.add_argument('what', choices=('predicates', ), + parser.add_argument('--config', type=str, + default=os.path.join(os.path.dirname(__file__), _loader.DEFAULT_CONFIG_FILE), + help='Path to the config file.') + parser.add_argument('what', choices=('predicates', 'schema'), help='Select what information to show.') args = parser.parse_args(argv) - # FIXME: Read reader/extractor configs from a config file - # reader builder - rbuild = builder.ReaderBuilder({}) - # extractor builder - ebuild = builder.ExtractorBuilder([ - {'bsie.extractor.generic.path.Path': {}}, - {'bsie.extractor.generic.stat.Stat': {}}, - {'bsie.extractor.generic.constant.Constant': dict( - tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')], - schema=''' - bse:author rdfs:subClassOf bsfs:Predicate ; - rdfs:domain bsfs:Entity ; - rdfs:range xsd:string ; - bsfs:unique "true"^^xsd:boolean . - ''', - )}, - ]) - # pipeline builder - pbuild = builder.PipelineBuilder( - bsfs.Namespace('http://example.com/me/'), # not actually used - rbuild, - ebuild, - ) - # build pipeline - pipeline = pbuild.build() + pipeline = _loader.load_pipeline(args.config) # show info if args.what == 'predicates': # show predicates for pred in pipeline.schema.predicates(): print(pred.uri) + elif args.what == 'schema': + # show schema + print(bsfs.schema.to_string(pipeline.schema)) else: # args.what is already checked by argparse raise errors.UnreachableError() diff --git a/bsie/base/__init__.py b/bsie/base/__init__.py deleted file mode 100644 index 0d362cd..0000000 --- a/bsie/base/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -"""The base module defines the BSIE interfaces. - -You'll mostly find abstract classes here. - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports -import typing - -# inner-module imports -from . import errors -from .extractor import Extractor -from .reader import Reader - -# exports -__all__: typing.Sequence[str] = ( - 'Extractor', - 'Reader', - 'errors', - ) - -## EOF ## diff --git a/bsie/extractor/__init__.py b/bsie/extractor/__init__.py index ef31343..36fa9ba 100644 --- a/bsie/extractor/__init__.py +++ b/bsie/extractor/__init__.py @@ -2,14 +2,18 @@ Each Extractor class is linked to the Reader class whose content it requires. -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing +# inner-module imports +from .base import Extractor +from .builder import ExtractorBuilder + # exports -__all__: typing.Sequence[str] = [] +__all__: typing.Sequence[str] = ( + 'Extractor', + 'ExtractorBuilder', + ) ## EOF ## diff --git a/bsie/base/extractor.py b/bsie/extractor/base.py index c44021b..f92d7cc 100644 --- a/bsie/base/extractor.py +++ b/bsie/extractor/base.py @@ -1,10 +1,6 @@ """The Extractor classes transform content into triples. - -Part of the bsie module. 
-A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import abc import typing @@ -28,16 +24,32 @@ SCHEMA_PREAMBLE = ''' prefix schema: <http://schema.org/> # common bsfs prefixes - prefix bsfs: <http://bsfs.ai/schema/> - prefix bse: <http://bsfs.ai/schema/Entity#> + prefix bsfs: <https://schema.bsfs.io/core/> + prefix bsl: <https://schema.bsfs.io/core/Literal/> + prefix bsa: <https://schema.bsfs.io/core/Literal/Array/> + prefix bsd: <https://schema.bsfs.io/core/distance#> + + prefix bsie: <https://schema.bsfs.io/ie/> + prefix bsn: <https://schema.bsfs.io/ie/Node/> + prefix bse: <https://schema.bsfs.io/ie/Node/Entity#> + prefix bsp: <https://schema.bsfs.io/ie/Node/Preview#> + + # default definitions + bsl:Array rdfs:subClassOf bsfs:Literal . + bsl:Number rdfs:subClassOf bsfs:Literal . + bsl:Time rdfs:subClassOf bsfs:Literal . + bsa:Feature rdfs:subClassOf bsl:Array ; + bsfs:dimension "1"^^xsd:integer ; + bsfs:dtype <https://schema.bsfs.io/core/dtype#f16> ; + bsfs:distance bsd:euclidean . # essential nodes - bsfs:Entity rdfs:subClassOf bsfs:Node . - bsfs:File rdfs:subClassOf bsfs:Entity . + bsn:Entity rdfs:subClassOf bsfs:Node . # common definitions xsd:string rdfs:subClassOf bsfs:Literal . - xsd:integer rdfs:subClassOf bsfs:Literal . + xsd:integer rdfs:subClassOf bsl:Number . + xsd:float rdfs:subClassOf bsl:Number . ''' @@ -83,7 +95,7 @@ class Extractor(abc.ABC): @property def principals(self) -> typing.Iterator[bsfs.schema.Predicate]: """Return the principal predicates, i.e., relations from/to the extraction subject.""" - ent = self.schema.node(ns.bsfs.Entity) + ent = self.schema.node(ns.bsn.Entity) return ( pred for pred @@ -99,5 +111,6 @@ class Extractor(abc.ABC): principals: typing.Iterable[bsfs.schema.Predicate], ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: """Return (node, predicate, value) triples.""" + # FIXME: type annotation could be more strict: value is Hashable ## EOF ## diff --git a/bsie/extractor/builder.py b/bsie/extractor/builder.py new file mode 100644 index 0000000..d691b0e --- /dev/null +++ b/bsie/extractor/builder.py @@ -0,0 +1,72 @@ + +# standard imports +import typing + +# bsie imports +from bsie.utils import bsfs, errors, safe_load, unpack_qualified_name + +# inner-module imports +from . import base + +# exports +__all__: typing.Sequence[str] = ( + 'ExtractorBuilder', + ) + + +## code ## + +class ExtractorBuilder(): + """Build `bsie.base.Extractor instances. + + It is permissible to build multiple instances of the same extractor + (typically with different arguments), hence the ExtractorBuilder + receives a list of build specifications. Each specification is + a dict with a single key (extractor's qualified name) and a dict + to be used as keyword arguments. + Example: [{'bsie.extractor.generic.path.Path': {}}, ] + + """ + + # build specifications + _specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]] + + def __init__(self, specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]): + self._specs = specs + + def __iter__(self) -> typing.Iterator[int]: + """Iterate over extractor specifications.""" + return iter(range(len(self._specs))) + + def build(self, index: int) -> base.Extractor: + """Return an instance of the n'th extractor (n=*index*).""" + # get build instructions + specs = self._specs[index] + + # check specs structure. 
expecting[{name: {kwargs}}] + if not isinstance(specs, dict): + raise TypeError(f'expected a dict, found {bsfs.typename(specs)}') + if len(specs) != 1: + raise TypeError(f'expected a dict of length one, found {len(specs)}') + + # get name and args from specs + name = next(iter(specs.keys())) + kwargs = specs[name] + + # check kwargs structure + if not isinstance(kwargs, dict): + raise TypeError(f'expected a dict, found {bsfs.typename(kwargs)}') + + # check name and get module/class components + module_name, class_name = unpack_qualified_name(name) + + # import extractor class + cls = safe_load(module_name, class_name) + + try: # build and return instance + return cls(**kwargs) + + except Exception as err: + raise errors.BuilderError(f'failed to build extractor {name} due to {bsfs.typename(err)}: {err}') from err + +## EOF ## diff --git a/bsie/extractor/generic/__init__.py b/bsie/extractor/generic/__init__.py index 0cb7e7f..46a4bd6 100644 --- a/bsie/extractor/generic/__init__.py +++ b/bsie/extractor/generic/__init__.py @@ -3,11 +3,8 @@ files. Examples include file system information (file name and size, mime type, etc.) and information that is independent of the actual file (constant triples, host platform infos, current time, etc.). -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # exports diff --git a/bsie/extractor/generic/constant.py b/bsie/extractor/generic/constant.py index 11384e6..7acbe95 100644 --- a/bsie/extractor/generic/constant.py +++ b/bsie/extractor/generic/constant.py @@ -1,16 +1,14 @@ """The Constant extractor produces pre-specified triples. - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # bsie imports -from bsie.base import extractor from bsie.utils import bsfs, node +# inner-module imports +from .. import base + # exports __all__: typing.Sequence[str] = ( 'Constant', @@ -19,7 +17,7 @@ __all__: typing.Sequence[str] = ( ## code ## -class Constant(extractor.Extractor): +class Constant(base.Extractor): """Extract information from file's path.""" CONTENT_READER = None @@ -32,7 +30,7 @@ class Constant(extractor.Extractor): schema: str, tuples: typing.Iterable[typing.Tuple[bsfs.URI, typing.Any]], ): - super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + schema)) + super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + schema)) # NOTE: Raises a KeyError if the predicate is not part of the schema self._tuples = tuple((self.schema.predicate(p_uri), value) for p_uri, value in tuples) # TODO: use schema instance for value checking diff --git a/bsie/extractor/generic/path.py b/bsie/extractor/generic/path.py index 7018e12..00c1121 100644 --- a/bsie/extractor/generic/path.py +++ b/bsie/extractor/generic/path.py @@ -1,15 +1,10 @@ -""" -Part of the bsie module. -A copy of the license is provided with the project. 
-Author: Matthias Baumgartner, 2022 -""" -# imports +# standard imports import os import typing # bsie imports -from bsie.base import extractor +from bsie.extractor import base from bsie.utils import bsfs, node, ns # exports @@ -20,7 +15,7 @@ __all__: typing.Sequence[str] = ( ## code ## -class Path(extractor.Extractor): +class Path(base.Extractor): """Extract information from file's path.""" CONTENT_READER = 'bsie.reader.path.Path' @@ -29,13 +24,13 @@ class Path(extractor.Extractor): _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[str], typing.Any]] def __init__(self): - super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' + super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + ''' bse:filename rdfs:subClassOf bsfs:Predicate ; - rdfs:domain bsfs:File ; + rdfs:domain bsn:Entity ; rdfs:range xsd:string ; rdfs:label "File name"^^xsd:string ; schema:description "Filename of entity in some filesystem."^^xsd:string ; - bsfs:unique "false"^^xsd:boolean . + bsfs:unique "true"^^xsd:boolean . ''')) self._callmap = { self.schema.predicate(ns.bse.filename): self.__filename, diff --git a/bsie/extractor/generic/stat.py b/bsie/extractor/generic/stat.py index 0b9ce29..92b51f3 100644 --- a/bsie/extractor/generic/stat.py +++ b/bsie/extractor/generic/stat.py @@ -1,17 +1,15 @@ """Extract information from the file system, such as filesize. - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import os import typing # bsie imports -from bsie.base import extractor from bsie.utils import bsfs, node, ns +# inner-module imports +from .. import base + # exports __all__: typing.Sequence[str] = ( 'Stat', @@ -20,7 +18,7 @@ __all__: typing.Sequence[str] = ( ## code ## -class Stat(extractor.Extractor): +class Stat(base.Extractor): """Extract information from the file system.""" CONTENT_READER = 'bsie.reader.stat.Stat' @@ -29,13 +27,13 @@ class Stat(extractor.Extractor): _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[os.stat_result], typing.Any]] def __init__(self): - super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' + super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + ''' bse:filesize rdfs:subClassOf bsfs:Predicate ; - rdfs:domain bsfs:File ; + rdfs:domain bsn:Entity ; rdfs:range xsd:integer ; rdfs:label "File size"^^xsd:string ; schema:description "File size of entity in some filesystem."^^xsd:string ; - bsfs:unique "false"^^xsd:boolean . + bsfs:unique "true"^^xsd:boolean . ''')) self._callmap = { self.schema.predicate(ns.bse.filesize): self.__filesize, diff --git a/bsie/extractor/image/__init__.py b/bsie/extractor/image/__init__.py new file mode 100644 index 0000000..f82424a --- /dev/null +++ b/bsie/extractor/image/__init__.py @@ -0,0 +1,8 @@ + +# standard imports +import typing + +# exports +__all__: typing.Sequence[str] = [] + +## EOF ## diff --git a/bsie/extractor/image/colors_spatial.py b/bsie/extractor/image/colors_spatial.py new file mode 100644 index 0000000..e6661a9 --- /dev/null +++ b/bsie/extractor/image/colors_spatial.py @@ -0,0 +1,150 @@ +"""Spatial color features. +""" +# standard imports +import typing + +# external imports +import PIL.Image +import numpy as np + +# bsie imports +from bsie.utils import bsfs, node, ns + +# inner-module imports +from .. 
import base + +# constants +FEATURE_NAME = ns.bsf.ColorsSpatial() + +# exports +__all__: typing.Sequence[str] = ( + 'ColorsSpatial', + ) + + +## code ## + +class ColorsSpatial(base.Extractor): + """Determine dominant colors of subregions in the image. + + Computes the domiant color of increasingly smaller subregions of the image. + """ + + CONTENT_READER = 'bsie.reader.image.Image' + + # Initial subregion width. + width: int + + # Initial subregion height. + height: int + + # Decrement exponent. + exp: float + + # Principal predicate's URI. + _predicate_name: bsfs.URI + + def __init__( + self, + width: int = 32, + height: int = 32, + exp: float = 4., + ): + # instance identifier + uuid = bsfs.uuid.UCID.from_dict({ + 'width': width, + 'height': height, + 'exp': exp, + }) + # determine symbol names + instance_name = getattr(FEATURE_NAME, uuid) + predicate_name = getattr(ns.bse, 'colors_spatial_' + uuid) + # get vector dimension + dimension = self.dimension(width, height, exp) + # initialize parent with the schema + super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + f''' + <{FEATURE_NAME}> rdfs:subClassOf bsa:Feature ; + # annotations + rdfs:label "Spatially dominant colors"^^xsd:string ; + schema:description "Domiant colors of subregions in an image."^^xsd:string ; + bsfs:distance <https://schema.bsfs.io/core/distance#euclidean> ; + bsfs:dtype xsd:integer . + + <{instance_name}> rdfs:subClassOf <{FEATURE_NAME}> ; + bsfs:dimension "{dimension}"^^xsd:integer ; + # annotations + <{FEATURE_NAME}/args#width> "{width}"^^xsd:integer ; + <{FEATURE_NAME}/args#height> "{height}"^^xsd:integer ; + <{FEATURE_NAME}/args#exp> "{exp}"^^xsd:float . + + <{predicate_name}> rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Entity ; + rdfs:range <{instance_name}> ; + bsfs:unique "true"^^xsd:boolean . 
+ + ''')) + # assign extra members + self.width = width + self.height = height + self.exp = exp + self._predicate_name = predicate_name + + def __repr__(self) -> str: + return f'{bsfs.typename(self)}({self.width}, {self.height}, {self.exp})' + + def __eq__(self, other: typing.Any) -> bool: + return super().__eq__(other) \ + and self.width == other.width \ + and self.height == other.height \ + and self.exp == other.exp + + def __hash__(self) -> int: + return hash((super().__hash__(), self.width, self.height, self.exp)) + + @staticmethod + def dimension(width: int, height: int, exp: float) -> int: + """Return the feature vector dimension.""" + # FIXME: replace with a proper formula + dim = 0 + while width >= 1 and height >= 1: + dim += width * height + width = np.floor(width / exp) + height = np.floor(height / exp) + dim *= 3 # per band + return int(dim) + + def extract( + self, + subject: node.Node, + content: PIL.Image.Image, + principals: typing.Iterable[bsfs.schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + # check principals + if self.schema.predicate(self._predicate_name) not in principals: + # nothing to do; abort + return + + # convert to HSV + content = content.convert('HSV') + + # get dimensions + width, height = self.width, self.height + num_bands = len(content.getbands()) # it's three since we converted to HSV before + + features = [] + while width >= 1 and height >= 1: + # downsample + img = content.resize((width, height), resample=PIL.Image.Resampling.BOX) + # feature vector + features.append( + np.array(img.getdata()).reshape((width * height, num_bands))) + # iterate + width = int(np.floor(width / self.exp)) + height = int(np.floor(height / self.exp)) + + # combine bands and convert features to tuple + value = tuple(np.vstack(features).reshape(-1)) + # return triple with feature vector as value + yield subject, self.schema.predicate(self._predicate_name), value + +## EOF ## diff --git a/bsie/extractor/image/photometrics.py b/bsie/extractor/image/photometrics.py new file mode 100644 index 0000000..42eb3c8 --- /dev/null +++ b/bsie/extractor/image/photometrics.py @@ -0,0 +1,211 @@ + +# standard imports +from fractions import Fraction +import typing + +# bsie imports +from bsie.utils import bsfs, node, ns + +# inner-module imports +from .. import base + +# exports +__all__: typing.Sequence[str] = ( + 'Exif', + ) + + +## code ## + +def _gps_to_dec(coords: typing.Tuple[float, float, float]) -> float: + """Convert GPS coordinates from exif to float.""" + # unpack args + deg, min, sec = coords # pylint: disable=redefined-builtin # min + # convert to float + deg = float(Fraction(deg)) + min = float(Fraction(min)) + sec = float(Fraction(sec)) + + if float(sec) > 0: + # format is deg+min+sec + return (float(deg) * 3600 + float(min) * 60 + float(sec)) / 3600 + # format is deg+min + return float(deg) + float(min) / 60 + + +class Exif(base.Extractor): + """Extract information from EXIF/IPTC tags of an image file.""" + + CONTENT_READER = 'bsie.reader.exif.Exif' + + def __init__(self): + super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + ''' + #bse:t_capture rdfs:subClassOf bsfs:Predicate ; + # rdfs:domain bsn:Entity ; + # rdfs:range xsd:float ; + # bsfs:unique "true"^^xsd:boolean . + bse:exposure rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Entity ; + rdfs:range xsd:float ; + bsfs:unique "true"^^xsd:boolean . 
+ bse:aperture rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Entity ; + rdfs:range xsd:float ; + bsfs:unique "true"^^xsd:boolean . + bse:iso rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Entity ; + rdfs:range xsd:integer ; + bsfs:unique "true"^^xsd:boolean . + bse:focal_length rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Entity ; + rdfs:range xsd:float ; + bsfs:unique "true"^^xsd:boolean . + bse:width rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Entity ; + rdfs:range xsd:integer ; + bsfs:unique "true"^^xsd:boolean . + bse:height rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Entity ; + rdfs:range xsd:integer ; + bsfs:unique "true"^^xsd:boolean . + bse:orientation rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Entity ; + rdfs:range xsd:integer ; + bsfs:unique "true"^^xsd:boolean . + bse:orientation_label rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Entity ; + rdfs:range xsd:string ; + bsfs:unique "true"^^xsd:boolean . + bse:altitude rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Entity ; + rdfs:range xsd:float ; + bsfs:unique "true"^^xsd:boolean . + bse:latitude rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Entity ; + rdfs:range xsd:float ; + bsfs:unique "true"^^xsd:boolean . + bse:longitude rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Entity ; + rdfs:range xsd:float ; + bsfs:unique "true"^^xsd:boolean . + ''')) + # initialize mapping from predicate to callback + self._callmap = { + #self.schema.predicate(ns.bse.t_capture): self._date, + self.schema.predicate(ns.bse.exposure): self._exposure, + self.schema.predicate(ns.bse.aperture): self._aperture, + self.schema.predicate(ns.bse.iso): self._iso, + self.schema.predicate(ns.bse.focal_length): self._focal_length, + self.schema.predicate(ns.bse.width): self._width, + self.schema.predicate(ns.bse.height): self._height, + self.schema.predicate(ns.bse.orientation): self._orientation, + self.schema.predicate(ns.bse.orientation_label): self._orientation_label, + self.schema.predicate(ns.bse.altitude): self._altitude, + self.schema.predicate(ns.bse.latitude): self._latitude, + self.schema.predicate(ns.bse.longitude): self._longitude, + } + + def extract( + self, + subject: node.Node, + content: dict, + principals: typing.Iterable[bsfs.schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + for pred in principals: + # find callback + clbk = self._callmap.get(pred) + if clbk is None: + continue + # get value + value = clbk(content) + if value is None: + continue + # produce triple + yield subject, pred, value + + #def _date(self, content: dict): # FIXME: Return type annotation + # date_keys = ( + # 'Exif.Photo.DateTimeOriginal', + # 'Exif.Photo.DateTimeDigitized', + # 'Exif.Image.DateTime', + # ) + # for key in date_keys: + # if key in content: + # dt = content[key].value + # if dt.tzinfo is None: + # dt = dt.replace(tzinfo=ttime.NoTimeZone) + # return dt + # return None + + + ## photometrics + + def _exposure(self, content: dict) -> typing.Optional[float]: + if 'Exif.Photo.ExposureTime' in content: + return 1.0 / float(Fraction(content['Exif.Photo.ExposureTime'])) + return None + + def _aperture(self, content: dict) -> typing.Optional[float]: + if 'Exif.Photo.FNumber' in content: + return float(Fraction(content['Exif.Photo.FNumber'])) + return None + + def _iso(self, content: dict) -> typing.Optional[int]: + if 'Exif.Photo.ISOSpeedRatings' in content: + return int(content['Exif.Photo.ISOSpeedRatings']) + return None + + def _focal_length(self, content: 
dict) -> typing.Optional[float]: + if 'Exif.Photo.FocalLength' in content: + return float(Fraction(content['Exif.Photo.FocalLength'])) + return None + + + ## image dimensions + + def _width(self, content: dict) -> typing.Optional[int]: + # FIXME: consider orientation! + if 'Exif.Photo.PixelXDimension' in content: + return int(content['Exif.Photo.PixelXDimension']) + return None + + def _height(self, content: dict) -> typing.Optional[int]: + # FIXME: consider orientation! + if 'Exif.Photo.PixelYDimension' in content: + return int(content['Exif.Photo.PixelYDimension']) + return None + + def _orientation(self, content: dict) -> typing.Optional[int]: + if 'Exif.Image.Orientation' in content: + return int(content['Exif.Image.Orientation']) + return None + + def _orientation_label(self, content: dict) -> typing.Optional[str]: + width = self._width(content) + height = self._height(content) + ori = self._orientation(content) + if width is not None and height is not None and ori is not None: + if ori <= 4: + return 'landscape' if width >= height else 'portrait' + return 'portrait' if width >= height else 'landscape' + return None + + + ## location + + def _altitude(self, content: dict) -> typing.Optional[float]: + if 'Exif.GPSInfo.GPSAltitude' in content: + return float(Fraction(content['Exif.GPSInfo.GPSAltitude'])) + return None + + def _latitude(self, content: dict) -> typing.Optional[float]: + if 'Exif.GPSInfo.GPSLatitude' in content: + return _gps_to_dec(content['Exif.GPSInfo.GPSLatitude'].split()) + return None + + def _longitude(self, content: dict) -> typing.Optional[float]: + if 'Exif.GPSInfo.GPSLongitude' in content: + return _gps_to_dec(content['Exif.GPSInfo.GPSLongitude'].split()) + return None + +## EOF ## diff --git a/bsie/extractor/preview.py b/bsie/extractor/preview.py new file mode 100644 index 0000000..145a01a --- /dev/null +++ b/bsie/extractor/preview.py @@ -0,0 +1,96 @@ + +# imports +import io +import typing + +# external imports +import PIL.Image + +# bsie imports +from bsie.utils import bsfs, node, ns + +# inner-module imports +from . import base + +# exports +__all__: typing.Sequence[str] = ( + 'Preview', + ) + + +## code ## + +class Preview(base.Extractor): + """Extract previews.""" + + CONTENT_READER = 'bsie.reader.preview.Preview' + + def __init__(self, max_sides: typing.Iterable[int]): + super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + ''' + + + + bsn:Preview rdfs:subClassOf bsfs:Node . + bsl:BinaryBlob rdfs:subClassOf bsfs:Literal . + <https://schema.bsfs.io/ie/Literal/BinaryBlob/JPEG> rdfs:subClassOf bsl:BinaryBlob . + + bse:preview rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Entity ; + rdfs:range bsn:Preview ; + bsfs:unique "false"^^xsd:boolean . + + bsp:width rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Preview ; + rdfs:range xsd:integer ; + bsfs:unique "true"^^xsd:boolean . + + bsp:height rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Preview ; + rdfs:range xsd:integer ; + bsfs:unique "true"^^xsd:boolean . + + bsp:asset rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Preview ; + rdfs:range <https://schema.bsfs.io/ie/Literal/BinaryBlob/JPEG> ; + bsfs:unique "true"^^xsd:boolean . 
+ + ''')) + # initialize extra args + self.max_sides = set(max_sides) + + def __eq__(self, other: typing.Any) -> bool: + return super().__eq__(other) \ + and self.max_sides == other.max_sides + + def __hash__(self) -> int: + return hash((super().__hash__(), tuple(sorted(self.max_sides)))) + + def extract( + self, + subject: node.Node, + content: typing.Callable[[int], PIL.Image.Image], + principals: typing.Iterable[bsfs.schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + # check principals + if self.schema.predicate(ns.bse.preview) not in principals: + return + + for max_side in self.max_sides: + # get the preview in the right resolution + img = content(max_side) + # convert the preview to jpeg + buffer = io.BytesIO() + img.save(buffer, format='jpeg') + # create a preview node + preview = node.Node(ns.bsn.Preview, + ucid=bsfs.uuid.UCID.from_bytes(buffer.getvalue()), + size=max_side, + source=subject, + ) + # yield triples + yield subject, self.schema.predicate(ns.bse.preview), preview + yield preview, self.schema.predicate(ns.bsp.width), img.width + yield preview, self.schema.predicate(ns.bsp.height), img.height + yield preview, self.schema.predicate(ns.bsp.asset), buffer.getvalue() + +## EOF ## diff --git a/bsie/lib/__init__.py b/bsie/lib/__init__.py index 578c2c4..f44fb74 100644 --- a/bsie/lib/__init__.py +++ b/bsie/lib/__init__.py @@ -1,18 +1,16 @@ -""" -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports +# standard imports import typing # inner-module imports from .bsie import BSIE +from .builder import PipelineBuilder +from .naming_policy import DefaultNamingPolicy # exports __all__: typing.Sequence[str] = ( 'BSIE', + 'PipelineBuilder', ) ## EOF ## diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py index e087fa9..b02e707 100644 --- a/bsie/lib/bsie.py +++ b/bsie/lib/bsie.py @@ -1,16 +1,14 @@ -""" -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports +# standard imports import typing # bsie imports -from bsie.tools import Pipeline from bsie.utils import bsfs, node, ns +# inner-module imports +from .naming_policy import NamingPolicy +from .pipeline import Pipeline + # exports __all__: typing.Sequence[str] = ( 'BSIE', @@ -39,15 +37,18 @@ class BSIE(): def __init__( self, - # pipeline builder. + # pipeline. pipeline: Pipeline, + # naming policy + naming_policy: NamingPolicy, # principals to extract at most. None implies all available w.r.t. extractors. collect: typing.Optional[typing.Iterable[bsfs.URI]] = None, # principals to discard. discard: typing.Optional[typing.Iterable[bsfs.URI]] = None, ): - # store pipeline + # store pipeline and naming policy self._pipeline = pipeline + self._naming_policy = naming_policy # start off with available principals self._principals = {pred.uri for pred in self._pipeline.principals} # limit principals to specified ones by argument. 
@@ -87,6 +88,6 @@ class BSIE(): # predicate lookup principals = {self.schema.predicate(pred) for pred in principals} # invoke pipeline - yield from self._pipeline(path, principals) + yield from self._naming_policy(self._pipeline(path, principals)) ## EOF ## diff --git a/bsie/lib/builder.py b/bsie/lib/builder.py new file mode 100644 index 0000000..3a15311 --- /dev/null +++ b/bsie/lib/builder.py @@ -0,0 +1,75 @@ + +# standard imports +import logging +import typing + +# bsie imports +from bsie.extractor import ExtractorBuilder +from bsie.reader import ReaderBuilder +from bsie.utils import errors + +# inner-module imports +from . import pipeline + +# exports +__all__: typing.Sequence[str] = ( + 'PipelineBuilder', + ) + + +## code ## + +logger = logging.getLogger(__name__) + +class PipelineBuilder(): + """Build `bsie.tools.pipeline.Pipeline` instances.""" + + # builder for Readers. + rbuild: ReaderBuilder + + # builder for Extractors. + ebuild: ExtractorBuilder + + def __init__( + self, + reader_builder: ReaderBuilder, + extractor_builder: ExtractorBuilder, + ): + self.rbuild = reader_builder + self.ebuild = extractor_builder + + def build(self) -> pipeline.Pipeline: + """Return a Pipeline instance.""" + ext2rdr = {} + + for eidx in self.ebuild: + # build extractor + try: + ext = self.ebuild.build(eidx) + + except errors.LoaderError as err: # failed to load extractor; skip + logger.error('failed to load extractor: %s', err) + continue + + except errors.BuilderError as err: # failed to build instance; skip + logger.error(str(err)) + continue + + try: + # get reader required by extractor + if ext.CONTENT_READER is not None: + rdr = self.rbuild.build(ext.CONTENT_READER) + else: + rdr = None + # store extractor + ext2rdr[ext] = rdr + + except errors.LoaderError as err: # failed to load reader + logger.error('failed to load reader: %s', err) + + except errors.BuilderError as err: # failed to build reader + logger.error(str(err)) + + return pipeline.Pipeline(ext2rdr) + +## EOF ## diff --git a/bsie/lib/naming_policy.py b/bsie/lib/naming_policy.py new file mode 100644 index 0000000..9b9a45d --- /dev/null +++ b/bsie/lib/naming_policy.py @@ -0,0 +1,115 @@ + +# standard imports +import abc +import os +import typing + +# bsie imports +from bsie.utils import bsfs, errors, ns +from bsie.utils.node import Node + +# exports +__all__: typing.Sequence[str] = ( + 'DefaultNamingPolicy', + ) + + +## code ## + +class NamingPolicy(): + """Determine node uri's from node hints.""" + def __call__( + self, + iterable: typing.Iterable[typing.Tuple[Node, bsfs.URI, typing.Any]], + ): + """Apply the policy on a triple iterator.""" + return NamingPolicyIterator(self, iterable) + + @abc.abstractmethod + def handle_node(self, node: Node) -> Node: + """Apply the policy on a node.""" + + +class NamingPolicyIterator(): + """Iterates over triples, determines uris according to a *policy* as it goes.""" + + # source triple iterator. 
+ _iterable: typing.Iterable[typing.Tuple[Node, bsfs.URI, typing.Any]] + + # naming policy + _policy: NamingPolicy + + def __init__( + self, + policy: NamingPolicy, + iterable: typing.Iterable[typing.Tuple[Node, bsfs.URI, typing.Any]], + ): + self._iterable = iterable + self._policy = policy + + def __iter__(self): + for node, pred, value in self._iterable: + # handle subject + self._policy.handle_node(node) + # handle value + if isinstance(value, Node): + self._policy.handle_node(value) + # yield triple + yield node, pred, value + + +class DefaultNamingPolicy(NamingPolicy): + """Compose URIs as <host/user/node_type#fragment> + + What information is used as fragment depends on the node type. + Typically, the default is to use the "ucid" hint. + The fallback in all cases is to generate a random uuid. + + Never changes previously assigned uris. Sets uris in-place. + + """ + + def __init__( + self, + host: bsfs.URI, + user: str, + ): + self._prefix = bsfs.Namespace(os.path.join(host, user)) + self._uuid = bsfs.uuid.UUID() + + def handle_node(self, node: Node) -> Node: + if node.uri is not None: + return node + if node.node_type == ns.bsn.Entity : + return self.name_file(node) + if node.node_type == ns.bsn.Preview: + return self.name_preview(node) + raise errors.ProgrammingError('no naming policy available for {node.node_type}') + + def name_file(self, node: Node) -> Node: + """Set a bsfs:File node's uri fragment to its ucid.""" + if 'ucid' in node.hints: # content id + fragment = node.hints['ucid'] + else: # random name + fragment = self._uuid() + node.uri = getattr(self._prefix.file(), fragment) + return node + + def name_preview(self, node: Node) -> Node: + """Set a bsfs:Preview node's uri fragment to its ucid. + Uses its source fragment as fallback. Appends the size if provided. + """ + fragment = None + if 'ucid' in node.hints: # content id + fragment = node.hints['ucid'] + if fragment is None and 'source' in node.hints: # source id + self.handle_node(node.hints['source']) + fragment = node.hints['source'].uri.get('fragment', None) + if fragment is None: # random name + fragment = self._uuid() + if 'size' in node.hints: # append size + fragment += '_s' + str(node.hints['size']) + node.uri = getattr(self._prefix.preview(), fragment) + return node + +## EOF ## diff --git a/bsie/tools/pipeline.py b/bsie/lib/pipeline.py index 20e8ddf..30fd6fd 100644 --- a/bsie/tools/pipeline.py +++ b/bsie/lib/pipeline.py @@ -1,25 +1,19 @@ -""" -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports +# standard imports from collections import defaultdict import logging import typing # bsie imports -from bsie import base -from bsie.utils import bsfs, node, ns +from bsie.extractor import Extractor +from bsie.reader import Reader +from bsie.utils import bsfs, errors, node, ns # exports __all__: typing.Sequence[str] = ( 'Pipeline', ) -# constants -FILE_PREFIX = 'file#' ## code ## @@ -39,19 +33,14 @@ class Pipeline(): # combined extractor schemas. _schema: bsfs.schema.Schema - # node prefix. 
- _prefix: bsfs.Namespace - # extractor -> reader mapping - _ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]] + _ext2rdr: typing.Dict[Extractor, typing.Optional[Reader]] def __init__( self, - prefix: bsfs.Namespace, - ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]] + ext2rdr: typing.Dict[Extractor, typing.Optional[Reader]] ): # store core members - self._prefix = prefix + FILE_PREFIX self._ext2rdr = ext2rdr # compile schema from all extractors self._schema = bsfs.schema.Schema.Union(ext.schema for ext in ext2rdr) @@ -63,12 +52,11 @@ class Pipeline(): return f'{bsfs.typename(self)}(...)' def __hash__(self) -> int: - return hash((type(self), self._prefix, self._schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values()))) + return hash((type(self), self._schema, tuple(self._ext2rdr), tuple(self._ext2rdr.values()))) def __eq__(self, other: typing.Any) -> bool: return isinstance(other, type(self)) \ and self._schema == other._schema \ - and self._prefix == other._prefix \ and self._ext2rdr == other._ext2rdr @property @@ -116,27 +104,33 @@ class Pipeline(): rdr2ext[rdr].add(ext) # create subject for file - uuid = bsfs.uuid.UCID.from_path(path) - subject = node.Node(ns.bsfs.File, self._prefix[uuid]) + subject = node.Node(ns.bsn.Entity, + ucid=bsfs.uuid.UCID.from_path(path), + ) # extract information for rdr, extrs in rdr2ext.items(): try: # get content content = rdr(path) if rdr is not None else None + #logger.info('extracted %s from %s', rdr, path) # apply extractors on this content for ext in extrs: try: # get predicate/value tuples - for subject, pred, value in ext.extract(subject, content, principals): - yield subject, pred, value + yield from ext.extract(subject, content, principals) - except base.errors.ExtractorError as err: + except errors.ExtractorError as err: # critical extractor failure. logger.error('%s failed to extract triples from content: %s', ext, err) - except base.errors.ReaderError as err: + except errors.UnsupportedFileFormatError: + # failed to read the file format. skip. + #logger.warning('%s could not process the file format of %s', rdr, err) + pass + + except errors.ReaderError as err: # failed to read any content. skip. logger.error('%s failed to read content: %s', rdr, err) diff --git a/bsie/reader/__init__.py b/bsie/reader/__init__.py index a45f22b..a1c38a9 100644 --- a/bsie/reader/__init__.py +++ b/bsie/reader/__init__.py @@ -1,8 +1,8 @@ """The Reader classes return high-level content structures from files. The Reader fulfills two purposes: - First, it brokers between multiple libraries and file formats. - Second, it separates multiple aspects of a file into distinct content types. +First, it brokers between multiple libraries and file formats. +Second, it separates multiple aspects of a file into distinct content types. Often, different libraries focus on reading different types of content from a file. E.g. one would use different modules to read file system infos than to @@ -11,9 +11,18 @@ type. Each distinct type can be implemented in a file or submodule that provides a Reader implementation. Through utilization of submodules, different file formats can be supported. -Part of the bsie module. -A copy of the license is provided with the project. 
-Author: Matthias Baumgartner, 2022 """ +# standard imports +import typing + +# inner-module imports +from .base import Reader +from .builder import ReaderBuilder + +# exports +__all__: typing.Sequence[str] = ( + 'Reader', + 'ReaderBuilder', + ) ## EOF ## diff --git a/bsie/base/reader.py b/bsie/reader/base.py index cbabd36..a775701 100644 --- a/bsie/base/reader.py +++ b/bsie/reader/base.py @@ -1,14 +1,5 @@ -"""The Reader classes return high-level content structures from files. -The Reader fulfills two purposes: - First, it brokers between multiple libraries and file formats. - Second, it separates multiple aspects of a file into distinct content types. - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports +# standard imports import abc import typing @@ -39,7 +30,7 @@ class Reader(abc.ABC): return hash(type(self)) @abc.abstractmethod - def __call__(self, path: bsfs.URI) -> typing.Any: + def __call__(self, path: str) -> typing.Any: """Return some content of the file at *path*. Raises a `ReaderError` if the reader cannot make sense of the file format. """ diff --git a/bsie/reader/builder.py b/bsie/reader/builder.py new file mode 100644 index 0000000..d32700b --- /dev/null +++ b/bsie/reader/builder.py @@ -0,0 +1,73 @@ + +# standard imports +import typing + +# bsie imports +from bsie.utils import bsfs, errors, safe_load, unpack_qualified_name + +# inner-module imports +from . import base + +# exports +__all__: typing.Sequence[str] = ( + 'ReaderBuilder', + ) + + +## code ## + +class ReaderBuilder(): + """Build `bsie.base.Reader` instances. + + Readers are defined via their qualified class name + (e.g., bsie.reader.path.Path) and optional keyword + arguments that are passed to the constructor via + the *kwargs* argument (name as key, kwargs as value). + The ReaderBuilder keeps a cache of previously built + reader instances, as they are anyway built with + identical keyword arguments. + + """ + + # keyword arguments + _kwargs: typing.Dict[str, typing.Dict[str, typing.Any]] + + # cached readers + _cache: typing.Dict[str, base.Reader] + + def __init__( + self, + kwargs: typing.Optional[typing.Dict[str, typing.Dict[str, typing.Any]]] = None): + if kwargs is None: + kwargs = {} + self._kwargs = kwargs + self._cache = {} + + def build(self, name: str) -> base.Reader: + """Return an instance for the qualified class name.""" + # return cached instance + if name in self._cache: + return self._cache[name] + + # check name and get module/class components + module_name, class_name = unpack_qualified_name(name) + + # import reader class + cls = safe_load(module_name, class_name) + + # get kwargs + kwargs = self._kwargs.get(name, {}) + if not isinstance(kwargs, dict): + raise TypeError(f'expected a kwargs dict, found {bsfs.typename(kwargs)}') + + try: # build, cache, and return instance + obj = cls(**kwargs) + # cache instance + self._cache[name] = obj + # return instance + return obj + + except Exception as err: + raise errors.BuilderError(f'failed to build reader {name} due to {bsfs.typename(err)}: {err}') from err + +## EOF ## diff --git a/bsie/reader/chain.py b/bsie/reader/chain.py new file mode 100644 index 0000000..79b44b4 --- /dev/null +++ b/bsie/reader/chain.py @@ -0,0 +1,86 @@ + +# standard imports +import logging +import typing + +# bsie imports +from bsie.utils import bsfs, errors + +# inner-module imports +from . import base +from . 
import builder + +# exports +__all__: typing.Sequence[str] = ( + 'ReaderChain', + ) + + +## code ## + +logger = logging.getLogger(__name__) + +# Content type. +T_CONTENT = typing.TypeVar('T_CONTENT') # pylint: disable=invalid-name + +class ReaderChain(base.Reader, typing.Generic[T_CONTENT]): + """Read an image.""" + + # sub-readers for specific file formats. + _children: typing.Tuple[base.Reader, ...] + + def __init__( + self, + subreader_names: typing.Iterable[str], + cfg: typing.Optional[typing.Any] = None, + ): + rbuild = builder.ReaderBuilder(cfg) + children = [] + for name in subreader_names: + try: + # build sub-reader + children.append(rbuild.build(name)) + except (ValueError, + TypeError, + errors.LoaderError, + errors.BuilderError) as err: + # failed to build a child; skip and notify + logger.warning('failed to load reader: %s', err) + + if len(children) == 0: + logger.warning('%s failed to load any sub-readers.', bsfs.typename(self)) + + # copy children to member + self._children = tuple(children) + + def __str__(self) -> str: + substr = ', '.join(str(child) for child in self._children) + return f'{bsfs.typename(self)}({substr})' + + def __repr__(self) -> str: + return f'{bsfs.typename(self)}({self._children})' + + def __eq__(self, other: typing.Any) -> bool: + return super().__eq__(other) \ + and self._children == other._children + + def __hash__(self) -> int: + return hash((super().__hash__(), self._children)) + + def __call__(self, path: str) -> T_CONTENT: + raise_error = False + for child in self._children: + try: + return child(path) + except errors.UnsupportedFileFormatError: + # child cannot read the file, skip. + pass + except errors.ReaderError: + # child failed to read the file, skip. + raise_error = True + + if raise_error: + raise errors.ReaderError(path) + raise errors.UnsupportedFileFormatError(path) + +## EOF ## diff --git a/bsie/reader/exif.py b/bsie/reader/exif.py new file mode 100644 index 0000000..2d0428b --- /dev/null +++ b/bsie/reader/exif.py @@ -0,0 +1,44 @@ + +# standard imports +import typing + +# external imports +import pyexiv2 + +# bsie imports +from bsie.utils import errors, filematcher + +# inner-module imports +from . import base + +# constants +MATCH_RULE = 'mime=image/jpeg' + +# exports +__all__: typing.Sequence[str] = ( + 'Exif', + ) + + +## code ## + +class Exif(base.Reader): + """Use pyexiv2 to read exif metadata from image files.""" + + def __init__(self): + self._match = filematcher.parse(MATCH_RULE) + + def __call__(self, path: str) -> dict: + # perform quick checks first + if not self._match(path): + raise errors.UnsupportedFileFormatError(path) + + try: + # open the file + img = pyexiv2.Image(path) + # read metadata + return img.read_exif() + except (TypeError, OSError, RuntimeError) as err: + raise errors.ReaderError(path) from err + +## EOF ## diff --git a/bsie/reader/image/__init__.py b/bsie/reader/image/__init__.py new file mode 100644 index 0000000..89642f2 --- /dev/null +++ b/bsie/reader/image/__init__.py @@ -0,0 +1,31 @@ + +# standard imports +import typing + +# external imports +import PIL.Image + +# inner-module imports +from .. 
import chain + +# constants +_FILE_FORMAT_READERS: typing.Sequence[str] = ( + __package__ + '._raw.RawImage', + __package__ + '._pillow.PillowImage', + ) + +# exports +__all__: typing.Sequence[str] = ( + 'Image', + ) + + +## code ## + +class Image(chain.ReaderChain[PIL.Image.Image]): # pylint: disable=too-few-public-methods + """Read an image file.""" + + def __init__(self, cfg: typing.Optional[typing.Any] = None): + super().__init__(_FILE_FORMAT_READERS, cfg) + +## EOF ## diff --git a/bsie/reader/image/_pillow.py b/bsie/reader/image/_pillow.py new file mode 100644 index 0000000..0611d3c --- /dev/null +++ b/bsie/reader/image/_pillow.py @@ -0,0 +1,34 @@ + +# standard imports +import typing + +# external imports +import PIL.Image + +# bsie imports +from bsie.utils import errors + +# inner-module imports +from .. import base + +# exports +__all__: typing.Sequence[str] = ( + 'PillowImage', + ) + + +## code ## + +class PillowImage(base.Reader): + """Use PIL to read content of a variety of image file types.""" + + def __call__(self, path: str) -> PIL.Image.Image: + try: + # open file with PIL + return PIL.Image.open(path) + except PIL.UnidentifiedImageError as err: + raise errors.UnsupportedFileFormatError(path) from err + except IOError as err: + raise errors.ReaderError(path) from err + +# EOF ## diff --git a/bsie/reader/image/_raw.py b/bsie/reader/image/_raw.py new file mode 100644 index 0000000..e5745aa --- /dev/null +++ b/bsie/reader/image/_raw.py @@ -0,0 +1,56 @@ + +# standard imports +import typing + +# external imports +import PIL.Image +import rawpy + +# bsie imports +from bsie.utils import errors, filematcher + +# inner-module imports +from .. import base + +# constants +MATCH_RULE = 'mime={image/x-nikon-nef} | extension={nef}' + +# exports +__all__: typing.Sequence[str] = ( + 'RawImage', + ) + + +## code ## + +class RawImage(base.Reader): + """Use rawpy to read content of raw image file types.""" + + # file matcher + _match: filematcher.Matcher + + # additional kwargs to rawpy's postprocess + _rawpy_kwargs: typing.Dict[str, typing.Any] + + def __init__(self, **rawpy_kwargs): + match_rule = rawpy_kwargs.pop('file_match_rule', MATCH_RULE) + self._match = filematcher.parse(match_rule) + self._rawpy_kwargs = rawpy_kwargs + + def __call__(self, path: str) -> PIL.Image.Image: + # perform quick checks first + if not self._match(path): + raise errors.UnsupportedFileFormatError(path) + + try: + # open file with rawpy + ary = rawpy.imread(path).postprocess(**self._rawpy_kwargs) + # convert to PIL.Image + return PIL.Image.fromarray(ary) + except (rawpy.LibRawFatalError, # pylint: disable=no-member # pylint doesn't find the errors + rawpy.NotSupportedError, # pylint: disable=no-member + rawpy.LibRawNonFatalError, # pylint: disable=no-member + ) as err: + raise errors.ReaderError(path) from err + +## EOF ## diff --git a/bsie/reader/path.py b/bsie/reader/path.py index d60f187..45eb127 100644 --- a/bsie/reader/path.py +++ b/bsie/reader/path.py @@ -1,14 +1,10 @@ """The Path reader produces a file path. - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing -# bsie imports -from bsie.base import reader +# inner-module imports +from . 
import base # exports __all__: typing.Sequence[str] = ( @@ -18,7 +14,7 @@ __all__: typing.Sequence[str] = ( ## code ## -class Path(reader.Reader): +class Path(base.Reader): """Return the path.""" def __call__(self, path: str) -> str: diff --git a/bsie/reader/preview/__init__.py b/bsie/reader/preview/__init__.py new file mode 100644 index 0000000..791a133 --- /dev/null +++ b/bsie/reader/preview/__init__.py @@ -0,0 +1,34 @@ + +# imports +import typing + +# external imports +import PIL.Image + +# inner-module imports +from .. import chain + +# constants +_FILE_FORMAT_READERS: typing.Sequence[str] = ( + # native image formats + __package__ + '._pillow.PillowPreviewReader', + __package__ + '._rawpy.RawpyPreviewReader', + # multiformat readers + __package__ + '._pg.PreviewGeneratorReader', + ) + +# exports +__all__: typing.Sequence[str] = ( + 'Preview', + ) + + +## code ## + +class Preview(chain.ReaderChain[typing.Callable[[int], PIL.Image.Image]]): # pylint: disable=too-few-public-methods + """Create a preview from a file.""" + + def __init__(self, cfg: typing.Optional[typing.Any] = None): + super().__init__(_FILE_FORMAT_READERS, cfg) + +## EOF ## diff --git a/bsie/reader/preview/_pg.py b/bsie/reader/preview/_pg.py new file mode 100644 index 0000000..401b33d --- /dev/null +++ b/bsie/reader/preview/_pg.py @@ -0,0 +1,81 @@ + +# standard imports +from functools import partial +import contextlib +import io +import os +import shutil +import tempfile +import typing + +# external imports +from preview_generator.manager import PreviewManager +import PIL.Image + +# bsie imports +from bsie.utils import errors + +# inner-module imports +from .. import base + +# exports +__all__: typing.Sequence[str] = ( + 'PreviewGeneratorReader', + ) + + +## code ## + +class PreviewGeneratorReader(base.Reader): + """Uses preview_generator to create previews for various data formats. + See `https://github.com/algoo/preview-generator`_ for details. + """ + + # PreviewManager instance. + _mngr: PreviewManager + + # Set of mime types supported by PreviewManager. + _supported_mimetypes: typing.Set[str] + + # PreviewManager cache. + _cache: str + + # Determines whether the cache directory should be deleted after use. 
+ _cleanup: bool + + def __init__(self, cache: typing.Optional[str] = None): + # initialize cache directory + # TODO: initialize in memory, e.g., via PyFilesystem + if cache is None: + self._cache = tempfile.mkdtemp(prefix='bsie-preview-cache-') + self._cleanup = True + else: + self._cache = cache + self._cleanup = False + # create preview generator + with contextlib.redirect_stderr(io.StringIO()): + self._mngr = PreviewManager(self._cache, create_folder=True) + self._supported_mimetypes = set(self._mngr.get_supported_mimetypes()) + + def __del__(self): + if self._cleanup: + shutil.rmtree(self._cache, ignore_errors=True) + + def __call__(self, path: str) -> typing.Callable[[int], PIL.Image.Image]: + if not os.path.exists(path): + raise errors.ReaderError(path) + if self._mngr.get_mimetype(path) not in self._supported_mimetypes: + raise errors.UnsupportedFileFormatError(path) + return partial(self._preview_callback, path) + + def _preview_callback(self, path: str, max_side: int) -> PIL.Image.Image: + """Produce a jpeg preview of *path* with at most *max_side* side length.""" + try: + # generate the preview + preview_path = self._mngr.get_jpeg_preview(path, width=max_side, height=max_side) + # open the preview and return + return PIL.Image.open(preview_path) + except Exception as err: # FIXME: less generic exception! + raise errors.ReaderError(path) from err + +## EOF ## diff --git a/bsie/reader/preview/_pillow.py b/bsie/reader/preview/_pillow.py new file mode 100644 index 0000000..2b797c6 --- /dev/null +++ b/bsie/reader/preview/_pillow.py @@ -0,0 +1,39 @@ + +# standard imports +from functools import partial +import typing + +# external imports +import PIL.Image + +# bsie imports +from bsie.utils import errors + +# inner-module imports +from . import utils +from .. import base + +# exports +__all__: typing.Sequence[str] = ( + 'PillowPreviewReader', + ) + + +## code ## + +class PillowPreviewReader(base.Reader): + """Produce previews for image files using the Pillow library.""" + + def __call__(self, path: str) -> typing.Callable[[int], PIL.Image.Image]: + try: + # open file with PIL + img = PIL.Image.open(path) + # return callback + return partial(utils.resize, img) + except PIL.UnidentifiedImageError as err: + # failed to open, skip file + raise errors.UnsupportedFileFormatError(path) from err + except OSError as err: + raise errors.ReaderError(path) from err + +# EOF ## diff --git a/bsie/reader/preview/_rawpy.py b/bsie/reader/preview/_rawpy.py new file mode 100644 index 0000000..16e8675 --- /dev/null +++ b/bsie/reader/preview/_rawpy.py @@ -0,0 +1,61 @@ + +# standard imports +from functools import partial +import typing + +# external imports +import PIL.Image +import rawpy + +# bsie imports +from bsie.utils import errors, filematcher + +# inner-module imports +from . import utils +from .. 
import base + +# constants +MATCH_RULE = 'mime={image/x-nikon-nef} | extension={nef}' + +# exports +__all__: typing.Sequence[str] = ( + 'RawpyPreviewReader', + ) + + +## code ## + +class RawpyPreviewReader(base.Reader): + """Produce previews for raw image files using the rawpy library.""" + + # file matcher + _match: filematcher.Matcher + + # additional kwargs to rawpy's postprocess + _rawpy_kwargs: typing.Dict[str, typing.Any] + + def __init__(self, **rawpy_kwargs): + match_rule = rawpy_kwargs.pop('file_match_rule', MATCH_RULE) + self._match = filematcher.parse(match_rule) + self._rawpy_kwargs = rawpy_kwargs + + def __call__(self, path: str) -> typing.Callable[[int], PIL.Image.Image]: + # perform quick checks first + if not self._match(path): + raise errors.UnsupportedFileFormatError(path) + + try: + # open file with rawpy + ary = rawpy.imread(path).postprocess(**self._rawpy_kwargs) + # convert to PIL.Image + img = PIL.Image.fromarray(ary) + # return callback + return partial(utils.resize, img) + + except (rawpy.LibRawFatalError, # pylint: disable=no-member # pylint doesn't find the errors + rawpy.NotSupportedError, # pylint: disable=no-member + rawpy.LibRawNonFatalError, # pylint: disable=no-member + ) as err: + raise errors.ReaderError(path) from err + +## EOF ## diff --git a/bsie/reader/preview/utils.py b/bsie/reader/preview/utils.py new file mode 100644 index 0000000..82ecc31 --- /dev/null +++ b/bsie/reader/preview/utils.py @@ -0,0 +1,34 @@ + +# standard imports +import typing + +# external imports +import PIL.Image + +# exports +__all__: typing.Sequence[str] = ( + 'resize', + ) + + +## code ## + +def resize( + img: PIL.Image.Image, + max_size: int, + ) -> PIL.Image.Image: + """Resize an image to a given maximum side length.""" + # determine target dimensions + ratio = img.width / img.height + if img.width > img.height: + width, height = max_size, round(max_size / ratio) + else: + width, height = round(ratio * max_size), max_size + # rescale and return + return img.resize( + (width, height), + resample=PIL.Image.Resampling.LANCZOS, # create high-quality image + reducing_gap=3.0, # optimize computation via fast size reduction + ) + +## EOF ## diff --git a/bsie/reader/stat.py b/bsie/reader/stat.py index fc5fb24..f42e7fb 100644 --- a/bsie/reader/stat.py +++ b/bsie/reader/stat.py @@ -1,15 +1,14 @@ """The Stat reader produces filesystem stat information. - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import os import typing # bsie imports -from bsie.base import errors, reader +from bsie.utils import errors + +# inner-module imports +from . import base # exports __all__: typing.Sequence[str] = ( @@ -19,7 +18,7 @@ __all__: typing.Sequence[str] = ( ## code ## -class Stat(reader.Reader): +class Stat(base.Reader): """Read and return the filesystem's stat infos.""" def __call__(self, path: str) -> os.stat_result: diff --git a/bsie/tools/__init__.py b/bsie/tools/__init__.py deleted file mode 100644 index 803c321..0000000 --- a/bsie/tools/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -""" - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports -import typing - -# inner-module imports -from . 
import builder -from .pipeline import Pipeline - -# exports -__all__: typing.Sequence[str] = ( - 'builder', - 'Pipeline', - ) - -## EOF ## diff --git a/bsie/tools/builder.py b/bsie/tools/builder.py deleted file mode 100644 index 190d9bf..0000000 --- a/bsie/tools/builder.py +++ /dev/null @@ -1,226 +0,0 @@ -""" - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports -import importlib -import logging -import typing - -# bsie imports -from bsie import base -from bsie.base import errors -from bsie.utils import bsfs - -# inner-module imports -from . import pipeline - -# exports -__all__: typing.Sequence[str] = ( - 'ExtractorBuilder', - 'PipelineBuilder', - 'ReaderBuilder', - ) - - -## code ## - -logger = logging.getLogger(__name__) - -def _safe_load(module_name: str, class_name: str): - """Get a class from a module. Raise BuilderError if anything goes wrong.""" - try: - # load the module - module = importlib.import_module(module_name) - except Exception as err: - # cannot import module - raise errors.LoaderError(f'cannot load module {module_name}') from err - - try: - # get the class from the module - cls = getattr(module, class_name) - except Exception as err: - # cannot find the class - raise errors.LoaderError(f'cannot load class {class_name} from module {module_name}') from err - - return cls - - -def _unpack_name(name): - """Split a name into its module and class component (dot-separated).""" - if not isinstance(name, str): - raise TypeError(name) - if '.' not in name: - raise ValueError('name must be a qualified class name.') - module_name, class_name = name[:name.rfind('.')], name[name.rfind('.')+1:] - if module_name == '': - raise ValueError('name must be a qualified class name.') - return module_name, class_name - - -class ReaderBuilder(): - """Build `bsie.base.Reader` instances. - - Readers are defined via their qualified class name - (e.g., bsie.reader.path.Path) and optional keyword - arguments that are passed to the constructor via - the *kwargs* argument (name as key, kwargs as value). - The ReaderBuilder keeps a cache of previously built - reader instances, as they are anyway built with - identical keyword arguments. - - """ - - # keyword arguments - _kwargs: typing.Dict[str, typing.Dict[str, typing.Any]] - - # cached readers - _cache: typing.Dict[str, base.Reader] - - def __init__(self, kwargs: typing.Dict[str, typing.Dict[str, typing.Any]]): - self._kwargs = kwargs - self._cache = {} - - def build(self, name: str) -> base.Reader: - """Return an instance for the qualified class name.""" - # return cached instance - if name in self._cache: - return self._cache[name] - - # check name and get module/class components - module_name, class_name = _unpack_name(name) - - # import reader class - cls = _safe_load(module_name, class_name) - - # get kwargs - kwargs = self._kwargs.get(name, {}) - if not isinstance(kwargs, dict): - raise TypeError(f'expected a kwargs dict, found {bsfs.typename(kwargs)}') - - try: # build, cache, and return instance - obj = cls(**kwargs) - # cache instance - self._cache[name] = obj - # return instance - return obj - - except Exception as err: - raise errors.BuilderError(f'failed to build reader {name} due to {bsfs.typename(err)}: {err}') from err - - -class ExtractorBuilder(): - """Build `bsie.base.Extractor instances. 
- - It is permissible to build multiple instances of the same extractor - (typically with different arguments), hence the ExtractorBuilder - receives a list of build specifications. Each specification is - a dict with a single key (extractor's qualified name) and a dict - to be used as keyword arguments. - Example: [{'bsie.extractor.generic.path.Path': {}}, ] - - """ - - # build specifications - _specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]] - - def __init__(self, specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]): - self._specs = specs - - def __iter__(self) -> typing.Iterator[int]: - """Iterate over extractor specifications.""" - return iter(range(len(self._specs))) - - def build(self, index: int) -> base.Extractor: - """Return an instance of the n'th extractor (n=*index*).""" - # get build instructions - specs = self._specs[index] - - # check specs structure. expecting[{name: {kwargs}}] - if not isinstance(specs, dict): - raise TypeError(f'expected a dict, found {bsfs.typename(specs)}') - if len(specs) != 1: - raise TypeError(f'expected a dict of length one, found {len(specs)}') - - # get name and args from specs - name = next(iter(specs.keys())) - kwargs = specs[name] - - # check kwargs structure - if not isinstance(kwargs, dict): - raise TypeError(f'expected a dict, found {bsfs.typename(kwargs)}') - - # check name and get module/class components - module_name, class_name = _unpack_name(name) - - # import extractor class - cls = _safe_load(module_name, class_name) - - try: # build and return instance - return cls(**kwargs) - - except Exception as err: - raise errors.BuilderError(f'failed to build extractor {name} due to {bsfs.typename(err)}: {err}') from err - - -class PipelineBuilder(): - """Build `bsie.tools.pipeline.Pipeline` instances.""" - - # Prefix to be used in the Pipeline. - prefix: bsfs.Namespace - - # builder for Readers. - rbuild: ReaderBuilder - - # builder for Extractors. - ebuild: ExtractorBuilder - - def __init__( - self, - prefix: bsfs.Namespace, - reader_builder: ReaderBuilder, - extractor_builder: ExtractorBuilder, - ): - self.prefix = prefix - self.rbuild = reader_builder - self.ebuild = extractor_builder - - def build(self) -> pipeline.Pipeline: - """Return a Pipeline instance.""" - ext2rdr = {} - - for eidx in self.ebuild: - # build extractor - try: - ext = self.ebuild.build(eidx) - - except errors.LoaderError as err: # failed to load extractor; skip - logger.error('failed to load extractor: %s', err) - continue - - except errors.BuilderError as err: # failed to build instance; skip - logger.error(str(err)) - continue - - try: - # get reader required by extractor - if ext.CONTENT_READER is not None: - rdr = self.rbuild.build(ext.CONTENT_READER) - else: - rdr = None - # store extractor - ext2rdr[ext] = rdr - - except errors.LoaderError as err: # failed to load reader - logger.error('failed to load reader: %s', err) - - except errors.BuilderError as err: # failed to build reader - logger.error(str(err)) - - return pipeline.Pipeline(self.prefix, ext2rdr) - - - -## EOF ## diff --git a/bsie/utils/__init__.py b/bsie/utils/__init__.py index bd22236..18c8db7 100644 --- a/bsie/utils/__init__.py +++ b/bsie/utils/__init__.py @@ -1,22 +1,23 @@ """Common tools and definitions. - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # inner-module imports from . import bsfs +from . import filematcher from . 
import namespaces as ns from . import node +from .loading import safe_load, unpack_qualified_name # exports __all__: typing.Sequence[str] = ( 'bsfs', + 'filematcher', 'node', 'ns', + 'safe_load', + 'unpack_qualified_name', ) ## EOF ## diff --git a/bsie/utils/bsfs.py b/bsie/utils/bsfs.py index 0b88479..fc045cc 100644 --- a/bsie/utils/bsfs.py +++ b/bsie/utils/bsfs.py @@ -1,10 +1,6 @@ """BSFS bridge, provides BSFS bindings for BSIE. - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # bsfs imports diff --git a/bsie/base/errors.py b/bsie/utils/errors.py index dc3c30e..7c7e6ed 100644 --- a/bsie/base/errors.py +++ b/bsie/utils/errors.py @@ -1,10 +1,6 @@ """Common BSIE exceptions. - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # exports @@ -39,4 +35,10 @@ class ProgrammingError(_BSIEError): class UnreachableError(ProgrammingError): """Bravo, you've reached a point in code that should logically not be reachable.""" +class ParserError(_BSIEError): + """Failed to parse due to invalid syntax or structures.""" + +class UnsupportedFileFormatError(_BSIEError): + """Failed to read a file format.""" + ## EOF ## diff --git a/bsie/utils/filematcher/__init__.py b/bsie/utils/filematcher/__init__.py new file mode 100644 index 0000000..908de78 --- /dev/null +++ b/bsie/utils/filematcher/__init__.py @@ -0,0 +1,15 @@ + +# standard imports +import typing + +# inner-module imports +from .matcher import Matcher +from .parser import parse + +# exports +__all__: typing.Sequence[str] = ( + 'Matcher', + 'parse', + ) + +## EOF ## diff --git a/bsie/utils/filematcher/matcher.py b/bsie/utils/filematcher/matcher.py new file mode 100644 index 0000000..1fa308e --- /dev/null +++ b/bsie/utils/filematcher/matcher.py @@ -0,0 +1,174 @@ + +# standard imports +from collections.abc import Callable, Collection, Hashable +import abc +import os +import typing + +# external imports +import magic + +# exports +__all__: typing.Sequence[str] = [] + + +## code ## + +# abstract nodes + +class Matcher(abc.ABC, Hashable, Callable, Collection): # type: ignore [misc] # Invalid base class Callable + """Matcher node base class.""" + + # child expressions or terminals + _childs: typing.Set[typing.Any] + + def __init__(self, *childs: typing.Any): + if len(childs) == 1 and isinstance(childs[0], (list, tuple, set)): + self._childs = set(childs[0]) + else: + self._childs = set(childs) + + def __contains__(self, needle: typing.Any) -> bool: + return needle in self._childs + + def __iter__(self) -> typing.Iterator[typing.Any]: + return iter(self._childs) + + def __len__(self) -> int: + return len(self._childs) + + def __repr__(self) -> str: + return f'{type(self).__name__}({self._childs})' + + def __hash__(self) -> int: + return hash((type(self), tuple(set(self._childs)))) + + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) \ + and self._childs == other._childs + + @abc.abstractmethod + def __call__(self, path: str) -> bool: # pylint: disable=arguments-differ + """Check if *path* satisfies the conditions set by the Matcher instance.""" + +class NOT(Matcher): + """Invert a matcher result.""" + def __init__(self, expr: Matcher): + super().__init__(expr) + def __call__(self, path: str) -> bool: + return not next(iter(self._childs))(path) + +# aggregate nodes + +class Aggregate(Matcher): # 
pylint: disable=too-few-public-methods # Yeah, it's an interface... + """Aggregation function base class (And, Or).""" + +class And(Aggregate): + """Accept only if all conditions are satisfied.""" + def __call__(self, path: str) -> bool: + for itm in self: + if not itm(path): + return False + return True + +class Or(Aggregate): + """Accept only if at least one condition is satisfied.""" + def __call__(self, path: str) -> bool: + for itm in self: + if itm(path): + return True + return False + + +# criteria nodes + +class Criterion(Matcher): + """Criterion base class. Limits acceptance to certain values.""" + def accepted(self) -> typing.Set[typing.Any]: + """Return a set of accepted values.""" + return self._childs + +# criteria w/o value (valueless) + +class Any(Criterion): + """Accepts anything.""" + def __call__(self, path: str) -> bool: + return True + +class Nothing(Criterion): + """Accepts nothing.""" + def __call__(self, path: str) -> bool: + return False + +class Exists(Criterion): + """Filters by existence.""" + def __call__(self, path: str) -> bool: + return os.path.exists(path) + +class IsFile(Criterion): + """Checks if the path is a regular file.""" + def __call__(self, path: str) -> bool: + return os.path.isfile(path) + +class IsDir(Criterion): + """Checks if the path is a directory.""" + def __call__(self, path: str) -> bool: + return os.path.isdir(path) + +class IsLink(Criterion): + """Checks if the path is a link.""" + def __call__(self, path: str) -> bool: + return os.path.islink(path) + +class IsAbs(Criterion): + """Checks if the path is an absolute path.""" + def __call__(self, path: str) -> bool: + return os.path.isabs(path) + +class IsRel(Criterion): + """Checks if the path is a relative path.""" + def __call__(self, path: str) -> bool: + return not os.path.isabs(path) + +class IsMount(Criterion): + """Checks if the path is a mount point.""" + def __call__(self, path: str) -> bool: + return os.path.ismount(path) + +class IsEmpty(Criterion): + """Checks if the path is an empty file.""" + def __call__(self, path: str) -> bool: + return os.path.exists(path) and os.stat(path).st_size == 0 + +class IsReadable(Criterion): + """Checks if the path is readable.""" + def __call__(self, path: str) -> bool: + return os.path.exists(path) and os.access(path, os.R_OK) + +class IsWritable(Criterion): + """Checks if the path is writable.""" + def __call__(self, path: str) -> bool: + return os.path.exists(path) and os.access(path, os.W_OK) + +class IsExecutable(Criterion): + """Checks if the path is executable.""" + def __call__(self, path: str) -> bool: + return os.path.exists(path) and os.access(path, os.X_OK) + +# criteria w/ value + +class Extension(Criterion): + """Filters by file extension (without the dot).""" + def __call__(self, path: str) -> bool: + _, ext = os.path.splitext(path) + return ext[1:] in self.accepted() + +class Mime(Criterion): + """Filters by mime type.""" + def __call__(self, path: str) -> bool: + try: + return magic.from_file(path, mime=True).lower() in self.accepted() + except FileNotFoundError: + return False + +## EOF ## diff --git a/bsie/utils/filematcher/parser.py b/bsie/utils/filematcher/parser.py new file mode 100644 index 0000000..dc28a0d --- /dev/null +++ b/bsie/utils/filematcher/parser.py @@ -0,0 +1,141 @@ + +# standard imports +import typing + +# external imports +import pyparsing +from pyparsing import printables, alphas8bit, punc8bit, QuotedString, Word, \ + delimitedList, Or, CaselessKeyword, Group, oneOf, Optional + +# inner-module imports +from 
. import matcher from .. import errors # exports __all__: typing.Sequence[str] = ( + 'parse', + ) + + +## code ## + +class FileMatcherParser(): + """ + EXPR := RULESET | RULESET "|" RULESET + RULESET := RULE | RULE, RULE + RULE := CRITERION OP VALUE | CRITERION OP {VALUES} | VALUELESS + OP := != | = + VALUES := VALUE | VALUE, VALUE + VALUE := [word] + CRITERION := mime | extension | ... + """ + + # criteria matcher nodes w/ arguments + _CRITERIA: typing.Dict[str, typing.Type[matcher.Matcher]] = { + 'extension': matcher.Extension, + 'mime': matcher.Mime, + } + + # criteria matcher nodes w/o arguments + _VALUELESS: typing.Dict[str, typing.Type[matcher.Matcher]] = { + 'any': matcher.Any, + 'nothing': matcher.Nothing, + 'exists': matcher.Exists, + 'isfile': matcher.IsFile, + 'isdir': matcher.IsDir, + 'islink': matcher.IsLink, + 'isabs': matcher.IsAbs, + 'isrel': matcher.IsRel, + 'ismount': matcher.IsMount, + 'empty': matcher.IsEmpty, + 'readable': matcher.IsReadable, + 'writable': matcher.IsWritable, + 'executable': matcher.IsExecutable, + } + + # pyparsing parser instance. + _parser: pyparsing.ParseExpression + + def __init__(self): + # build the parser + # VALUE := [word] + alphabet = (printables + alphas8bit + punc8bit).translate(str.maketrans('', '', ',{}|=')) + value = QuotedString(quoteChar='"', escChar='\\') ^ Word(alphabet) + # CRITERION := mime | extension | ... + criterion = Or([CaselessKeyword(p) for p in self._CRITERIA]).setResultsName('criterion') + valueless = Or([CaselessKeyword(p) for p in self._VALUELESS]).setResultsName('criterion') + # VALUES := VALUE | VALUE, VALUE + values = delimitedList(value, delim=',').setResultsName('value') + # OP := '=' | '!=' + eqop = oneOf('= !=').setResultsName('op') + # RULE := CRITERION OP VALUE | CRITERION OP {VALUES} | VALUELESS + rule_none = Group(Optional('!').setResultsName('op') + valueless).setResultsName('rule_none') + rule_one = Group(criterion + eqop + value.setResultsName('value')).setResultsName('rule_one') + rule_few = Group(criterion + eqop + '{' + values + '}').setResultsName('rule_few') + # RULESET := RULE | RULE, RULE + ruleset = Group(delimitedList(rule_none ^ rule_one ^ rule_few, delim=',')) + # EXPR := RULESET | RULESET \| RULESET + self._parser = delimitedList(ruleset, delim='|') + + def parse(self, query: str) -> matcher.Matcher: # pylint: disable=too-many-branches + """Build a file matcher from a rule definition.""" + # preprocess the query + query = query.strip() + + # empty query + if len(query) == 0: + return matcher.Any() + + try: + parsed = self._parser.parseString(query, parseAll=True) + except pyparsing.ParseException as err: + raise errors.ParserError(f'cannot parse query: {err}') from err + + # convert to Matcher + rules = [] + for exp in parsed: + tokens = [] + for rule in exp: + # fetch accepted values + if rule.getName() == 'rule_none': + accepted = [] + elif rule.getName() == 'rule_one': + accepted = [rule.value] + elif rule.getName() == 'rule_few': + accepted = list(rule.value) + else: # prevented by grammar + raise errors.UnreachableError('Invalid rule definition') + + # build criterion + if rule.criterion in self._VALUELESS: + cls = self._VALUELESS[rule.criterion] + if rule.op == '!': + tokens.append(matcher.NOT(cls())) + else: + tokens.append(cls()) + elif rule.criterion in self._CRITERIA: + cls = self._CRITERIA[rule.criterion] + if rule.op == '!=': + tokens.append(matcher.NOT(cls(accepted))) + else: + tokens.append(cls(accepted)) + else: # prevented by grammar + raise errors.UnreachableError(f'Invalid condition
"{rule.criterion}"') + + # And-aggregate rules in one ruleset (if needed) + tokens = matcher.And(tokens) if len(tokens) > 1 else tokens[0] + rules.append(tokens) + + # Or-aggregate rulesets + expr = matcher.Or(rules) if len(rules) > 1 else rules[0] + + return expr + +# build default instance +file_match_parser = FileMatcherParser() + +def parse(query: str) -> matcher.Matcher: + """Shortcut for FileMatcherParser()(query).""" + return file_match_parser.parse(query) + +## EOF ## diff --git a/bsie/utils/loading.py b/bsie/utils/loading.py new file mode 100644 index 0000000..58202d1 --- /dev/null +++ b/bsie/utils/loading.py @@ -0,0 +1,49 @@ + +# standard imports +import importlib +import typing + +# inner-module imports +from . import errors + +# exports +__all__: typing.Sequence[str] = ( + 'safe_load', + 'unpack_qualified_name', + ) + + +## code ## + +def safe_load(module_name: str, class_name: str): + """Get a class from a module. Raise BuilderError if anything goes wrong.""" + try: + # load the module + module = importlib.import_module(module_name) + except Exception as err: + # cannot import module + raise errors.LoaderError(f'cannot load module {module_name} ({err})') from err + + try: + # get the class from the module + cls = getattr(module, class_name) + except Exception as err: + # cannot find the class + raise errors.LoaderError(f'cannot load class {class_name} from module {module_name} ({err})') from err + + return cls + + +def unpack_qualified_name(name): + """Split a name into its module and class component (dot-separated).""" + if not isinstance(name, str): + raise TypeError(name) + if '.' not in name: + raise ValueError('name must be a qualified class name.') + module_name, class_name = name[:name.rfind('.')], name[name.rfind('.')+1:] + if module_name == '': + raise ValueError('name must be a qualified class name.') + return module_name, class_name + + +## EOF ## diff --git a/bsie/utils/namespaces.py b/bsie/utils/namespaces.py index a29fc1b..4a66048 100644 --- a/bsie/utils/namespaces.py +++ b/bsie/utils/namespaces.py @@ -1,26 +1,37 @@ """Default namespaces used throughout BSIE. - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # inner-module imports from . import bsfs as _bsfs -# constants -bse = _bsfs.Namespace('http://bsfs.ai/schema/Entity') -bsfs = _bsfs.Namespace('http://bsfs.ai/schema', fsep='/') -bsm = _bsfs.Namespace('http://bsfs.ai/schema/Meta') -xsd = _bsfs.Namespace('http://www.w3.org/2001/XMLSchema') +# generic namespaces +xsd = _bsfs.Namespace('http://www.w3.org/2001/XMLSchema')() + +# core bsfs/bsie namespaces +bsfs = _bsfs.Namespace('https://schema.bsfs.io/core') +bsie = _bsfs.Namespace('https://schema.bsfs.io/ie') + +# auxiliary namespaces +bsd = bsie.distance() +bse = bsie.Node.Entity() +bsf = bsie.Literal.Array.Feature +bsl = bsfs.Literal +bsn = bsie.Node +bsp = bsie.Node.Preview() # export __all__: typing.Sequence[str] = ( + 'bsd', 'bse', + 'bsf', 'bsfs', - 'bsm', + 'bsie', + 'bsl', + 'bsl', + 'bsn', + 'bsp', 'xsd', ) diff --git a/bsie/utils/node.py b/bsie/utils/node.py index ecf39cd..fa34b2e 100644 --- a/bsie/utils/node.py +++ b/bsie/utils/node.py @@ -1,10 +1,6 @@ """Lighweight Node to bridge to BSFS. - -Part of the bsie module. -A copy of the license is provided with the project. 
-Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # bsie imports @@ -19,30 +15,47 @@ __all__: typing.Sequence[str] = ( ## code ## class Node(): - """Lightweight Node, disconnected from any bsfs structures.""" + """Lightweight Node, disconnected from any bsfs structures. + + In most cases, provide *hints* and leave setting the uri to a node + naming policy. Only provide a *uri* if it is absolutely determined. + + """ # node type. node_type: bsfs.URI # node URI. - uri: bsfs.URI + uri: typing.Optional[bsfs.URI] + + # node naming hints. + hints: dict def __init__( self, node_type: bsfs.URI, - uri: bsfs.URI, + uri: typing.Optional[bsfs.URI] = None, + **uri_hints, ): # assign members self.node_type = bsfs.URI(node_type) - self.uri = bsfs.URI(uri) + self.hints = uri_hints + self.uri = uri def __eq__(self, other: typing.Any) -> bool: + """Compare two Node instances based on type and uri. + Compares hints only if the uri is not yet specified. + """ return isinstance(other, Node) \ and other.node_type == self.node_type \ - and other.uri == self.uri + and other.uri == self.uri \ + and (self.uri is not None or self.hints == other.hints) def __hash__(self) -> int: - return hash((type(self), self.node_type, self.uri)) + identifier = self.uri + if identifier is None: + identifier = tuple((key, self.hints[key]) for key in sorted(self.hints)) + return hash((type(self), self.node_type, identifier)) def __str__(self) -> str: return f'{bsfs.typename(self)}({self.node_type}, {self.uri})' |
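The reworked Node above supports uri-less instances that carry naming hints instead of a fixed uri. A minimal sketch of the intended semantics follows; the hint names (path, camera), the example uris, and the choice of ns.bse as node type are illustrative assumptions, not part of the change:

    # hint-based nodes: equality and hashing fall back to the hints while uri is unset
    from bsie.utils import node, ns

    a = node.Node(ns.bse, path='/tmp/example.jpg')   # hypothetical hint
    b = node.Node(ns.bse, path='/tmp/example.jpg')
    assert a.uri is None      # the final uri is left to a node naming policy
    assert a == b             # same type, no uri, identical hints

    c = node.Node(ns.bse, uri='http://example.com/e#1', camera='X')  # hypothetical uri and hint
    d = node.Node(ns.bse, uri='http://example.com/e#1')
    assert c == d             # once a uri is set, hints no longer affect equality

Similarly, the filematcher package introduced in this change set gives the readers a small rule language for deciding whether to accept a file. A sketch of how the exported parse entry point might be used; the rules and paths below are made up for illustration:

    # rulesets separated by '|' are OR-ed; comma-separated rules within a ruleset are AND-ed
    from bsie.utils import filematcher

    match = filematcher.parse('mime={image/jpeg,image/png} | extension=nef, readable')
    match('/tmp/photo.jpg')   # True if libmagic reports one of the listed mime types
    match('/tmp/shot.nef')    # True if the file has a .nef extension and is readable
    match('/missing/file')    # False: neither ruleset is satisfied

This mirrors the MATCH_RULE constant used by the raw image readers ('mime={image/x-nikon-nef} | extension={nef}').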