diff options
Diffstat (limited to 'bsie/extractor')
-rw-r--r-- | bsie/extractor/__init__.py | 11 | ||||
-rw-r--r-- | bsie/extractor/base.py | 113 | ||||
-rw-r--r-- | bsie/extractor/builder.py | 77 | ||||
-rw-r--r-- | bsie/extractor/generic/__init__.py | 2 | ||||
-rw-r--r-- | bsie/extractor/generic/constant.py | 10 | ||||
-rw-r--r-- | bsie/extractor/generic/path.py | 8 | ||||
-rw-r--r-- | bsie/extractor/generic/stat.py | 10 | ||||
-rw-r--r-- | bsie/extractor/image/__init__.py | 13 | ||||
-rw-r--r-- | bsie/extractor/image/colors_spatial.py | 154 |
9 files changed, 383 insertions, 15 deletions
diff --git a/bsie/extractor/__init__.py b/bsie/extractor/__init__.py index ef31343..5f385ee 100644 --- a/bsie/extractor/__init__.py +++ b/bsie/extractor/__init__.py @@ -6,10 +6,17 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing +# inner-module imports +from .base import Extractor +from .builder import ExtractorBuilder + # exports -__all__: typing.Sequence[str] = [] +__all__: typing.Sequence[str] = ( + 'Extractor', + 'ExtractorBuilder', + ) ## EOF ## diff --git a/bsie/extractor/base.py b/bsie/extractor/base.py new file mode 100644 index 0000000..7401244 --- /dev/null +++ b/bsie/extractor/base.py @@ -0,0 +1,113 @@ +"""The Extractor classes transform content into triples. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# standard imports +import abc +import typing + +# bsie imports +from bsie.utils import bsfs, node, ns + +# exports +__all__: typing.Sequence[str] = ( + 'Extractor', + ) + +# constants + +# essential definitions typically used in extractor schemas. +# NOTE: This preamble is only for convenience; Each Extractor must implement its use, if so desired. +SCHEMA_PREAMBLE = ''' + # common external prefixes + prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> + prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> + prefix xsd: <http://www.w3.org/2001/XMLSchema#> + prefix schema: <http://schema.org/> + + # common bsfs prefixes + prefix bsfs: <http://bsfs.ai/schema/> + prefix bse: <http://bsfs.ai/schema/Entity#> + + # default definitions + bsfs:Array rdfs:subClassOf bsfs:Literal . + bsfs:Number rdfs:subClassOf bsfs:Literal . + bsfs:Time rdfs:subClassOf bsfs:Literal . + bsfs:Feature rdfs:subClassOf bsfs:Array ; + bsfs:dimension "1"^^xsd:integer ; + bsfs:dtype bsfs:f16 ; + bsfs:distance bsfs:euclidean . + + # essential nodes + bsfs:Entity rdfs:subClassOf bsfs:Node . + bsfs:File rdfs:subClassOf bsfs:Entity . + + # common definitions + xsd:string rdfs:subClassOf bsfs:Literal . + xsd:integer rdfs:subClassOf bsfs:Number . + + ''' + + +## code ## + +class Extractor(abc.ABC): + """Produce (subject, predicate, value)-triples from some content. + The Extractor produces princpal predicates that provide information + about the content itself (i.e., triples that include the subject), + and may also generate triples with auxiliary predicates if the + extracted value is a node itself. + """ + + # what type of content is expected (i.e. reader subclass). + CONTENT_READER: typing.Optional[str] = None + + # extractor schema. + _schema: bsfs.schema.Schema + + def __init__(self, schema: bsfs.schema.Schema): + self._schema = schema + + def __str__(self) -> str: + return bsfs.typename(self) + + def __repr__(self) -> str: + return f'{bsfs.typename(self)}()' + + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) \ + and self.CONTENT_READER == other.CONTENT_READER \ + and self.schema == other.schema + + def __hash__(self) -> int: + return hash((type(self), self.CONTENT_READER, self.schema)) + + @property + def schema(self) -> bsfs.schema.Schema: + """Return the extractor's schema.""" + return self._schema + + @property + def principals(self) -> typing.Iterator[bsfs.schema.Predicate]: + """Return the principal predicates, i.e., relations from/to the extraction subject.""" + ent = self.schema.node(ns.bsfs.Entity) + return ( + pred + for pred + in self.schema.predicates() + if pred.domain <= ent or (pred.range is not None and pred.range <= ent) + ) + + @abc.abstractmethod + def extract( + self, + subject: node.Node, + content: typing.Any, + principals: typing.Iterable[bsfs.schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + """Return (node, predicate, value) triples.""" + # FIXME: type annotation could be more strict: value is Hashable + +## EOF ## diff --git a/bsie/extractor/builder.py b/bsie/extractor/builder.py new file mode 100644 index 0000000..0fd3685 --- /dev/null +++ b/bsie/extractor/builder.py @@ -0,0 +1,77 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# standard imports +import typing + +# bsie imports +from bsie.utils import bsfs, errors, safe_load, unpack_qualified_name + +# inner-module imports +from . import base + +# exports +__all__: typing.Sequence[str] = ( + 'ExtractorBuilder', + ) + + +## code ## + +class ExtractorBuilder(): + """Build `bsie.base.Extractor instances. + + It is permissible to build multiple instances of the same extractor + (typically with different arguments), hence the ExtractorBuilder + receives a list of build specifications. Each specification is + a dict with a single key (extractor's qualified name) and a dict + to be used as keyword arguments. + Example: [{'bsie.extractor.generic.path.Path': {}}, ] + + """ + + # build specifications + _specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]] + + def __init__(self, specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]): + self._specs = specs + + def __iter__(self) -> typing.Iterator[int]: + """Iterate over extractor specifications.""" + return iter(range(len(self._specs))) + + def build(self, index: int) -> base.Extractor: + """Return an instance of the n'th extractor (n=*index*).""" + # get build instructions + specs = self._specs[index] + + # check specs structure. expecting[{name: {kwargs}}] + if not isinstance(specs, dict): + raise TypeError(f'expected a dict, found {bsfs.typename(specs)}') + if len(specs) != 1: + raise TypeError(f'expected a dict of length one, found {len(specs)}') + + # get name and args from specs + name = next(iter(specs.keys())) + kwargs = specs[name] + + # check kwargs structure + if not isinstance(kwargs, dict): + raise TypeError(f'expected a dict, found {bsfs.typename(kwargs)}') + + # check name and get module/class components + module_name, class_name = unpack_qualified_name(name) + + # import extractor class + cls = safe_load(module_name, class_name) + + try: # build and return instance + return cls(**kwargs) + + except Exception as err: + raise errors.BuilderError(f'failed to build extractor {name} due to {bsfs.typename(err)}: {err}') from err + +## EOF ## diff --git a/bsie/extractor/generic/__init__.py b/bsie/extractor/generic/__init__.py index 0cb7e7f..4783949 100644 --- a/bsie/extractor/generic/__init__.py +++ b/bsie/extractor/generic/__init__.py @@ -7,7 +7,7 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # exports diff --git a/bsie/extractor/generic/constant.py b/bsie/extractor/generic/constant.py index 11384e6..938e20c 100644 --- a/bsie/extractor/generic/constant.py +++ b/bsie/extractor/generic/constant.py @@ -4,13 +4,15 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # bsie imports -from bsie.base import extractor from bsie.utils import bsfs, node +# inner-module imports +from .. import base + # exports __all__: typing.Sequence[str] = ( 'Constant', @@ -19,7 +21,7 @@ __all__: typing.Sequence[str] = ( ## code ## -class Constant(extractor.Extractor): +class Constant(base.Extractor): """Extract information from file's path.""" CONTENT_READER = None @@ -32,7 +34,7 @@ class Constant(extractor.Extractor): schema: str, tuples: typing.Iterable[typing.Tuple[bsfs.URI, typing.Any]], ): - super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + schema)) + super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + schema)) # NOTE: Raises a KeyError if the predicate is not part of the schema self._tuples = tuple((self.schema.predicate(p_uri), value) for p_uri, value in tuples) # TODO: use schema instance for value checking diff --git a/bsie/extractor/generic/path.py b/bsie/extractor/generic/path.py index 7018e12..c984515 100644 --- a/bsie/extractor/generic/path.py +++ b/bsie/extractor/generic/path.py @@ -4,12 +4,12 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import os import typing # bsie imports -from bsie.base import extractor +from bsie.extractor import base from bsie.utils import bsfs, node, ns # exports @@ -20,7 +20,7 @@ __all__: typing.Sequence[str] = ( ## code ## -class Path(extractor.Extractor): +class Path(base.Extractor): """Extract information from file's path.""" CONTENT_READER = 'bsie.reader.path.Path' @@ -29,7 +29,7 @@ class Path(extractor.Extractor): _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[str], typing.Any]] def __init__(self): - super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' + super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + ''' bse:filename rdfs:subClassOf bsfs:Predicate ; rdfs:domain bsfs:File ; rdfs:range xsd:string ; diff --git a/bsie/extractor/generic/stat.py b/bsie/extractor/generic/stat.py index 0b9ce29..9394456 100644 --- a/bsie/extractor/generic/stat.py +++ b/bsie/extractor/generic/stat.py @@ -4,14 +4,16 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import os import typing # bsie imports -from bsie.base import extractor from bsie.utils import bsfs, node, ns +# inner-module imports +from .. import base + # exports __all__: typing.Sequence[str] = ( 'Stat', @@ -20,7 +22,7 @@ __all__: typing.Sequence[str] = ( ## code ## -class Stat(extractor.Extractor): +class Stat(base.Extractor): """Extract information from the file system.""" CONTENT_READER = 'bsie.reader.stat.Stat' @@ -29,7 +31,7 @@ class Stat(extractor.Extractor): _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[os.stat_result], typing.Any]] def __init__(self): - super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + ''' + super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + ''' bse:filesize rdfs:subClassOf bsfs:Predicate ; rdfs:domain bsfs:File ; rdfs:range xsd:integer ; diff --git a/bsie/extractor/image/__init__.py b/bsie/extractor/image/__init__.py new file mode 100644 index 0000000..75b118d --- /dev/null +++ b/bsie/extractor/image/__init__.py @@ -0,0 +1,13 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# standard imports +import typing + +# exports +__all__: typing.Sequence[str] = [] + +## EOF ## diff --git a/bsie/extractor/image/colors_spatial.py b/bsie/extractor/image/colors_spatial.py new file mode 100644 index 0000000..ce5b9f2 --- /dev/null +++ b/bsie/extractor/image/colors_spatial.py @@ -0,0 +1,154 @@ +"""Spatial color features. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# standard imports +import typing + +# external imports +import PIL.Image +import numpy as np + +# bsie imports +from bsie.utils import bsfs, node, ns + +# inner-module imports +from .. import base + +# constants +FEATURE_NAME = ns.bsf + 'ColorsSpatial' +PREDICATE_NAME = ns.bse + 'colors_spatial' + +# exports +__all__: typing.Sequence[str] = ( + 'ColorsSpatial', + ) + + +## code ## + +class ColorsSpatial(base.Extractor): + """Determine dominant colors of subregions in the image. + + Computes the domiant color of increasingly smaller subregions of the image. + """ + + CONTENT_READER = 'bsie.reader.image.Image' + + # Initial subregion width. + width: int + + # Initial subregion height. + height: int + + # Decrement exponent. + exp: float + + # Principal predicate's URI. + _predicate_name: bsfs.URI + + def __init__( + self, + width: int = 32, + height: int = 32, + exp: float = 4., + ): + # instance identifier + uuid = bsfs.uuid.UCID.from_dict({ + 'width': width, + 'height': height, + 'exp': exp, + }) + # determine symbol names + instance_name = FEATURE_NAME[uuid] + predicate_name = PREDICATE_NAME[uuid] + # get vector dimension + dimension = self.dimension(width, height, exp) + # initialize parent with the schema + super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + f''' + <{FEATURE_NAME}> rdfs:subClassOf bsfs:Feature ; + # annotations + rdfs:label "Spatially dominant colors"^^xsd:string ; + schema:description "Domiant colors of subregions in an image."^^xsd:string ; + bsfs:dtype xsd:integer . + + <{instance_name}> rdfs:subClassOf <{FEATURE_NAME}> ; + bsfs:dimension "{dimension}"^^xsd:integer ; + # annotations + <{FEATURE_NAME}/args#width> "{width}"^^xsd:integer ; + <{FEATURE_NAME}/args#height> "{height}"^^xsd:integer ; + <{FEATURE_NAME}/args#exp> "{exp}"^^xsd:float . + + <{predicate_name}> rdfs:subClassOf bsfs:Predicate ; + rdfs:domain bsfs:File ; + rdfs:range <{instance_name}> ; + bsfs:unique "true"^^xsd:boolean . + + ''')) + # assign extra members + self.width = width + self.height = height + self.exp = exp + self._predicate_name = predicate_name + + def __repr__(self) -> str: + return f'{bsfs.typename(self)}({self.width}, {self.height}, {self.exp})' + + def __eq__(self, other: typing.Any) -> bool: + return super().__eq__(other) \ + and self.width == other.width \ + and self.height == other.height \ + and self.exp == other.exp + + def __hash__(self) -> int: + return hash((super().__hash__(), self.width, self.height, self.exp)) + + @staticmethod + def dimension(width: int, height: int, exp: float) -> int: + """Return the feature vector dimension.""" + # FIXME: replace with a proper formula + dim = 0 + while width >= 1 and height >= 1: + dim += width * height + width = np.floor(width / exp) + height = np.floor(height / exp) + dim *= 3 # per band + return int(dim) + + def extract( + self, + subject: node.Node, + content: PIL.Image, + principals: typing.Iterable[bsfs.schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + # check principals + if self.schema.predicate(self._predicate_name) not in principals: + # nothing to do; abort + return + + # convert to HSV + content = content.convert('HSV') + + # get dimensions + width, height = self.width, self.height + num_bands = len(content.getbands()) # it's three since we converted to HSV before + + features = [] + while width >= 1 and height >= 1: + # downsample + img = content.resize((width, height), resample=PIL.Image.Resampling.BOX) + # feature vector + features.append( + np.array(img.getdata()).reshape((width * height, num_bands))) + # iterate + width = int(np.floor(width / self.exp)) + height = int(np.floor(height / self.exp)) + + # combine bands and convert features to tuple + value = tuple(np.vstack(features).reshape(-1)) + # return triple with feature vector as value + yield subject, self.schema.predicate(self._predicate_name), value + +## EOF ## |