diff options
author | Matthias Baumgartner <dev@igsor.net> | 2023-04-17 18:47:58 +0200 |
---|---|---|
committer | Matthias Baumgartner <dev@igsor.net> | 2023-04-17 18:47:58 +0200 |
commit | be6027859c815e18b08a49ca1a45df3fc0aac301 (patch) | |
tree | e978249655fcab58f9ee1479c268ca8b06af7e8d /bsie | |
parent | af81318ae9311fd0b0e16949cef3cfaf7996970b (diff) | |
parent | aefd0cb4fa1a949beabc51e88a5c46843043a439 (diff) | |
download | bsie-be6027859c815e18b08a49ca1a45df3fc0aac301.tar.gz bsie-be6027859c815e18b08a49ca1a45df3fc0aac301.tar.bz2 bsie-be6027859c815e18b08a49ca1a45df3fc0aac301.zip |
Merge branch 'mb/iptc' into develop
Diffstat (limited to 'bsie')
-rw-r--r-- | bsie/apps/_loader.py | 6 | ||||
-rw-r--r-- | bsie/apps/index.py | 23 | ||||
-rw-r--r-- | bsie/apps/info.py | 2 | ||||
-rw-r--r-- | bsie/extractor/builder.py | 3 | ||||
-rw-r--r-- | bsie/extractor/image/iptc.py | 70 | ||||
-rw-r--r-- | bsie/lib/naming_policy.py | 27 | ||||
-rw-r--r-- | bsie/reader/exif.py | 21 | ||||
-rw-r--r-- | bsie/utils/__init__.py | 1 | ||||
-rw-r--r-- | bsie/utils/filewalker.py | 31 | ||||
-rw-r--r-- | bsie/utils/namespaces.py | 2 |
10 files changed, 157 insertions, 29 deletions
diff --git a/bsie/apps/_loader.py b/bsie/apps/_loader.py index 6411f10..d9ea9bb 100644 --- a/bsie/apps/_loader.py +++ b/bsie/apps/_loader.py @@ -1,5 +1,6 @@ # standard imports +import os import typing # external imports @@ -12,8 +13,7 @@ from bsie.lib.pipeline import Pipeline from bsie.reader import ReaderBuilder # constants -DEFAULT_CONFIG_FILE = 'default_config.yaml' - +DEFAULT_CONFIG_FILE = os.path.join(os.path.dirname(__file__), 'default_config.yaml') # exports __all__: typing.Sequence[str] = ( 'DEFAULT_CONFIG_FILE', @@ -23,7 +23,7 @@ __all__: typing.Sequence[str] = ( ## code ## -def load_pipeline(path: str) -> Pipeline: +def load_pipeline(path: str = DEFAULT_CONFIG_FILE) -> Pipeline: """Load a pipeline according to a config at *path*.""" # load config file with open(path, 'rt', encoding='utf-8') as ifile: diff --git a/bsie/apps/index.py b/bsie/apps/index.py index d64e8c2..7dda6f4 100644 --- a/bsie/apps/index.py +++ b/bsie/apps/index.py @@ -6,7 +6,7 @@ import typing # bsie imports from bsie.lib import BSIE, DefaultNamingPolicy -from bsie.utils import bsfs, errors, node as node_ +from bsie.utils import bsfs, errors, node as node_, list_files # inner-module imports from . import _loader @@ -23,7 +23,7 @@ def main(argv): """Index files or directories into BSFS.""" parser = argparse.ArgumentParser(description=main.__doc__, prog='index') parser.add_argument('--config', type=str, - default=os.path.join(os.path.dirname(__file__), _loader.DEFAULT_CONFIG_FILE), + default=_loader.DEFAULT_CONFIG_FILE, help='Path to the config file.') parser.add_argument('--host', type=bsfs.URI, default=bsfs.URI('http://example.com'), help='') @@ -59,22 +59,9 @@ def main(argv): # FIXME: simplify code (below but maybe also above) # FIXME: How to handle dependencies between data? # E.g. do I still want to link to a tag despite not being permitted to set its label? - - # index input paths - for path in args.input_file: - if not os.path.exists(path): - pass # FIXME: notify the user - elif os.path.isdir(path) and args.recursive: - for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=args.follow): - for filename in filenames: - for node, pred, value in bsie.from_file(os.path.join(dirpath, filename)): - handle(node, pred, value) - elif os.path.isfile(path): - for node, pred, value in bsie.from_file(path): - handle(node, pred, value) - else: - raise errors.UnreachableError() - + for path in list_files(args.input_file, args.recursive, args.follow): + for node, pred, value in bsie.from_file(path): + handle(node, pred, value) if args.print: walk(print) diff --git a/bsie/apps/info.py b/bsie/apps/info.py index e27b70b..b6494da 100644 --- a/bsie/apps/info.py +++ b/bsie/apps/info.py @@ -23,7 +23,7 @@ def main(argv): """Show information from BSIE.""" parser = argparse.ArgumentParser(description=main.__doc__, prog='info') parser.add_argument('--config', type=str, - default=os.path.join(os.path.dirname(__file__), _loader.DEFAULT_CONFIG_FILE), + default=_loader.DEFAULT_CONFIG_FILE, help='Path to the config file.') parser.add_argument('what', choices=('predicates', 'schema'), help='Select what information to show.') diff --git a/bsie/extractor/builder.py b/bsie/extractor/builder.py index d691b0e..8353a93 100644 --- a/bsie/extractor/builder.py +++ b/bsie/extractor/builder.py @@ -67,6 +67,7 @@ class ExtractorBuilder(): return cls(**kwargs) except Exception as err: - raise errors.BuilderError(f'failed to build extractor {name} due to {bsfs.typename(err)}: {err}') from err + raise errors.BuilderError( + f'failed to build extractor {name} due to {bsfs.typename(err)}: {err}') from err ## EOF ## diff --git a/bsie/extractor/image/iptc.py b/bsie/extractor/image/iptc.py new file mode 100644 index 0000000..195eff7 --- /dev/null +++ b/bsie/extractor/image/iptc.py @@ -0,0 +1,70 @@ + +# standard imports +import typing + +# bsie imports +from bsie.utils import bsfs, node, ns + +# inner-module imports +from .. import base + +# exports +__all__: typing.Sequence[str] = ( + 'Iptc', + ) + + +## code ## + +class Iptc(base.Extractor): + """Turn IPTC keywords into tags.""" + + CONTENT_READER = 'bsie.reader.exif.Iptc' + + def __init__(self): + super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + ''' + bsn:Tag rdfs:subClassOf bsfs:Node . + + bse:tag rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Entity ; + rdfs:range bsn:Tag . + + <https://schema.bsfs.io/ie/Node/Tag#label> rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsn:Tag ; + rdfs:range xsd:string ; + bsfs:unique "true"^^xsd:boolean . + + ''')) + self._callmap = { + self.schema.predicate(ns.bse.tag): self._keywords, + } + + def extract( + self, + subject: node.Node, + content: dict, + principals: typing.Iterable[bsfs.schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + for pred in principals: + # find callback + clbk = self._callmap.get(pred) + if clbk is None: + continue + # produce triples + yield from clbk(subject, content) + + def _keywords( + self, + subject: node.Node, + content: dict, + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + if 'Iptc.Application2.Keywords' not in content: + return + for keyword in content['Iptc.Application2.Keywords']: + tag = node.Node(ns.bsn.Tag, label=keyword) + yield subject, self.schema.predicate(ns.bse.tag), tag + yield tag, self.schema.predicate(ns.bst.label), keyword + + + +## EOF ## diff --git a/bsie/lib/naming_policy.py b/bsie/lib/naming_policy.py index 9b9a45d..ffef7d9 100644 --- a/bsie/lib/naming_policy.py +++ b/bsie/lib/naming_policy.py @@ -4,6 +4,9 @@ import abc import os import typing +# external imports +import urllib.parse + # bsie imports from bsie.utils import bsfs, errors, ns from bsie.utils.node import Node @@ -80,14 +83,16 @@ class DefaultNamingPolicy(NamingPolicy): def handle_node(self, node: Node) -> Node: if node.uri is not None: return node - if node.node_type == ns.bsn.Entity : - return self.name_file(node) + if node.node_type == ns.bsn.Entity: + return self.name_entity(node) if node.node_type == ns.bsn.Preview: return self.name_preview(node) - raise errors.ProgrammingError('no naming policy available for {node.node_type}') + if node.node_type == ns.bsn.Tag: + return self.name_tag(node) + raise errors.ProgrammingError(f'no naming policy available for {node.node_type}') - def name_file(self, node: Node) -> Node: - """Set a bsfs:File node's uri fragment to its ucid.""" + def name_entity(self, node: Node) -> Node: + """Set a bsn:Entity node's uri fragment to its ucid.""" if 'ucid' in node.hints: # content id fragment = node.hints['ucid'] else: # random name @@ -96,7 +101,7 @@ class DefaultNamingPolicy(NamingPolicy): return node def name_preview(self, node: Node) -> Node: - """Set a bsfs:Preview node's uri fragment to its ucid. + """Set a bsn:Preview node's uri fragment to its ucid. Uses its source fragment as fallback. Appends the size if provided. """ fragment = None @@ -112,4 +117,14 @@ class DefaultNamingPolicy(NamingPolicy): node.uri = getattr(self._prefix.preview(), fragment) return node + def name_tag(self, node: Node) -> Node: + # NOTE: Must ensure to produce the same name for that tags with the same label. + if 'label' in node.hints: # tag label + fragment = urllib.parse.quote(node.hints['label']) + else: # random name + fragment = self._uuid() + # FIXME: match to existing tags in bsfs storage! + node.uri = getattr(self._prefix.tag(), fragment) + return node + ## EOF ## diff --git a/bsie/reader/exif.py b/bsie/reader/exif.py index 2d0428b..7ec7574 100644 --- a/bsie/reader/exif.py +++ b/bsie/reader/exif.py @@ -17,6 +17,7 @@ MATCH_RULE = 'mime=image/jpeg' # exports __all__: typing.Sequence[str] = ( 'Exif', + 'Iptc', ) @@ -41,4 +42,24 @@ class Exif(base.Reader): except (TypeError, OSError, RuntimeError) as err: raise errors.ReaderError(path) from err + +class Iptc(base.Reader): + """Use pyexiv2 to read iptc metadata from image files.""" + + def __init__(self): + self._match = filematcher.parse(MATCH_RULE) + + def __call__(self, path: str) -> dict: + # perform quick checks first + if not self._match(path): + raise errors.UnsupportedFileFormatError(path) + + try: + # open the file + img = pyexiv2.Image(path) + # read metadata + return img.read_iptc() + except (TypeError, OSError, RuntimeError) as err: + raise errors.ReaderError(path) from err + ## EOF ## diff --git a/bsie/utils/__init__.py b/bsie/utils/__init__.py index 18c8db7..4f08604 100644 --- a/bsie/utils/__init__.py +++ b/bsie/utils/__init__.py @@ -8,6 +8,7 @@ from . import bsfs from . import filematcher from . import namespaces as ns from . import node +from .filewalker import list_files from .loading import safe_load, unpack_qualified_name # exports diff --git a/bsie/utils/filewalker.py b/bsie/utils/filewalker.py new file mode 100644 index 0000000..3c36926 --- /dev/null +++ b/bsie/utils/filewalker.py @@ -0,0 +1,31 @@ + +# standard imports +import os +import typing + +# exports +__all__: typing.Sequence[str] = ( + 'list_files', + ) + + +## code ## + +def list_files( + roots: typing.Iterable[str], + recursive: bool = True, + follow_symlinks: bool = True, + ) -> typing.Iterator[str]: + """Iterate over all files in *roots*, recursively by default.""" + # index input paths + for path in roots: + if not os.path.exists(path): + continue + elif os.path.isdir(path) and recursive: + for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=follow_symlinks): + for filename in filenames: + yield os.path.join(dirpath, filename) + elif os.path.isfile(path): + yield path + +## EOF ## diff --git a/bsie/utils/namespaces.py b/bsie/utils/namespaces.py index 4a66048..9357253 100644 --- a/bsie/utils/namespaces.py +++ b/bsie/utils/namespaces.py @@ -20,6 +20,7 @@ bsf = bsie.Literal.Array.Feature bsl = bsfs.Literal bsn = bsie.Node bsp = bsie.Node.Preview() +bst = bsie.Node.Tag() # export __all__: typing.Sequence[str] = ( @@ -32,6 +33,7 @@ __all__: typing.Sequence[str] = ( 'bsl', 'bsn', 'bsp', + 'bst', 'xsd', ) |