about | summary | refs | log | tree | commit | diff | stats
path: root/bsie/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'bsie/extractor')
-rw-r--r-- bsie/extractor/__init__.py 11
-rw-r--r-- bsie/extractor/base.py 113
-rw-r--r-- bsie/extractor/builder.py 77
-rw-r--r-- bsie/extractor/generic/__init__.py 2
-rw-r--r-- bsie/extractor/generic/constant.py 10
-rw-r--r-- bsie/extractor/generic/path.py 8
-rw-r--r-- bsie/extractor/generic/stat.py 10
-rw-r--r-- bsie/extractor/image/__init__.py 13
-rw-r--r-- bsie/extractor/image/colors_spatial.py 154
9 files changed, 383 insertions, 15 deletions
diff --git a/bsie/extractor/__init__.py b/bsie/extractor/__init__.py
index ef31343..5f385ee 100644
--- a/bsie/extractor/__init__.py
+++ b/bsie/extractor/__init__.py
@@ -6,10 +6,17 @@ Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import typing
+# inner-module imports
+from .base import Extractor
+from .builder import ExtractorBuilder
+
# exports
-__all__: typing.Sequence[str] = []
+__all__: typing.Sequence[str] = (
+ 'Extractor',
+ 'ExtractorBuilder',
+ )
## EOF ##
diff --git a/bsie/extractor/base.py b/bsie/extractor/base.py
new file mode 100644
index 0000000..7401244
--- /dev/null
+++ b/bsie/extractor/base.py
@@ -0,0 +1,113 @@
+"""The Extractor classes transform content into triples.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# standard imports
+import abc
+import typing
+
+# bsie imports
+from bsie.utils import bsfs, node, ns
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Extractor',
+ )
+
+# constants
+
+# essential definitions typically used in extractor schemas.
+# NOTE: This preamble is only for convenience; each Extractor must implement its use, if so desired.
+SCHEMA_PREAMBLE = '''
+ # common external prefixes
+ prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+ prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+ prefix xsd: <http://www.w3.org/2001/XMLSchema#>
+ prefix schema: <http://schema.org/>
+
+ # common bsfs prefixes
+ prefix bsfs: <http://bsfs.ai/schema/>
+ prefix bse: <http://bsfs.ai/schema/Entity#>
+
+ # default definitions
+ bsfs:Array rdfs:subClassOf bsfs:Literal .
+ bsfs:Number rdfs:subClassOf bsfs:Literal .
+ bsfs:Time rdfs:subClassOf bsfs:Literal .
+ bsfs:Feature rdfs:subClassOf bsfs:Array ;
+ bsfs:dimension "1"^^xsd:integer ;
+ bsfs:dtype bsfs:f16 ;
+ bsfs:distance bsfs:euclidean .
+
+ # essential nodes
+ bsfs:Entity rdfs:subClassOf bsfs:Node .
+ bsfs:File rdfs:subClassOf bsfs:Entity .
+
+ # common definitions
+ xsd:string rdfs:subClassOf bsfs:Literal .
+ xsd:integer rdfs:subClassOf bsfs:Number .
+
+ '''
+
+
+## code ##
+
+class Extractor(abc.ABC):
+ """Produce (subject, predicate, value)-triples from some content.
+ The Extractor produces principal predicates that provide information
+ about the content itself (i.e., triples that include the subject),
+ and may also generate triples with auxiliary predicates if the
+ extracted value is a node itself.
+ """
+
+ # what type of content is expected (i.e. reader subclass).
+ CONTENT_READER: typing.Optional[str] = None
+
+ # extractor schema.
+ _schema: bsfs.schema.Schema
+
+ def __init__(self, schema: bsfs.schema.Schema):
+ self._schema = schema
+
+ def __str__(self) -> str:
+ return bsfs.typename(self)
+
+ def __repr__(self) -> str:
+ return f'{bsfs.typename(self)}()'
+
+ def __eq__(self, other: typing.Any) -> bool:
+ return isinstance(other, type(self)) \
+ and self.CONTENT_READER == other.CONTENT_READER \
+ and self.schema == other.schema
+
+ def __hash__(self) -> int:
+ return hash((type(self), self.CONTENT_READER, self.schema))
+
+ @property
+ def schema(self) -> bsfs.schema.Schema:
+ """Return the extractor's schema."""
+ return self._schema
+
+ @property
+ def principals(self) -> typing.Iterator[bsfs.schema.Predicate]:
+ """Return the principal predicates, i.e., relations from/to the extraction subject."""
+ ent = self.schema.node(ns.bsfs.Entity)
+ return (
+ pred
+ for pred
+ in self.schema.predicates()
+ if pred.domain <= ent or (pred.range is not None and pred.range <= ent)
+ )
+
+ @abc.abstractmethod
+ def extract(
+ self,
+ subject: node.Node,
+ content: typing.Any,
+ principals: typing.Iterable[bsfs.schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+ """Return (node, predicate, value) triples."""
+ # FIXME: type annotation could be more strict: value is Hashable
+
+## EOF ##
diff --git a/bsie/extractor/builder.py b/bsie/extractor/builder.py
new file mode 100644
index 0000000..0fd3685
--- /dev/null
+++ b/bsie/extractor/builder.py
@@ -0,0 +1,77 @@
+"""Build Extractor instances from build specifications.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# standard imports
+import typing
+
+# bsie imports
+from bsie.utils import bsfs, errors, safe_load, unpack_qualified_name
+
+# inner-module imports
+from . import base
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'ExtractorBuilder',
+ )
+
+
+## code ##
+
+class ExtractorBuilder():
+ """Build `bsie.extractor.Extractor` instances.
+
+ It is permissible to build multiple instances of the same extractor
+ (typically with different arguments), hence the ExtractorBuilder
+ receives a list of build specifications. Each specification is
+ a dict with a single key (extractor's qualified name) and a dict
+ to be used as keyword arguments.
+ Example: [{'bsie.extractor.generic.path.Path': {}}, ]
+
+ """
+
+ # build specifications
+ _specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]
+
+ def __init__(self, specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]):
+ self._specs = specs
+
+ def __iter__(self) -> typing.Iterator[int]:
+ """Iterate over extractor specifications."""
+ return iter(range(len(self._specs)))
+
+ def build(self, index: int) -> base.Extractor:
+ """Return an instance of the n'th extractor (n=*index*)."""
+ # get build instructions
+ specs = self._specs[index]
+
+ # check specs structure; expecting a single-entry dict {name: {kwargs}}
+ if not isinstance(specs, dict):
+ raise TypeError(f'expected a dict, found {bsfs.typename(specs)}')
+ if len(specs) != 1:
+ raise TypeError(f'expected a dict of length one, found {len(specs)}')
+
+ # get name and args from specs
+ name = next(iter(specs.keys()))
+ kwargs = specs[name]
+
+ # check kwargs structure
+ if not isinstance(kwargs, dict):
+ raise TypeError(f'expected a dict, found {bsfs.typename(kwargs)}')
+
+ # check name and get module/class components
+ module_name, class_name = unpack_qualified_name(name)
+
+ # import extractor class
+ cls = safe_load(module_name, class_name)
+
+ try: # build and return instance
+ return cls(**kwargs)
+
+ except Exception as err:
+ raise errors.BuilderError(f'failed to build extractor {name} due to {bsfs.typename(err)}: {err}') from err
+
+## EOF ##
diff --git a/bsie/extractor/generic/__init__.py b/bsie/extractor/generic/__init__.py
index 0cb7e7f..4783949 100644
--- a/bsie/extractor/generic/__init__.py
+++ b/bsie/extractor/generic/__init__.py
@@ -7,7 +7,7 @@ Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import typing
# exports
diff --git a/bsie/extractor/generic/constant.py b/bsie/extractor/generic/constant.py
index 11384e6..938e20c 100644
--- a/bsie/extractor/generic/constant.py
+++ b/bsie/extractor/generic/constant.py
@@ -4,13 +4,15 @@ Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import typing
# bsie imports
-from bsie.base import extractor
from bsie.utils import bsfs, node
+# inner-module imports
+from .. import base
+
# exports
__all__: typing.Sequence[str] = (
'Constant',
@@ -19,7 +21,7 @@ __all__: typing.Sequence[str] = (
## code ##
-class Constant(extractor.Extractor):
+class Constant(base.Extractor):
"""Extract information from file's path."""
CONTENT_READER = None
@@ -32,7 +34,7 @@ class Constant(extractor.Extractor):
schema: str,
tuples: typing.Iterable[typing.Tuple[bsfs.URI, typing.Any]],
):
- super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + schema))
+ super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + schema))
# NOTE: Raises a KeyError if the predicate is not part of the schema
self._tuples = tuple((self.schema.predicate(p_uri), value) for p_uri, value in tuples)
# TODO: use schema instance for value checking
diff --git a/bsie/extractor/generic/path.py b/bsie/extractor/generic/path.py
index 7018e12..c984515 100644
--- a/bsie/extractor/generic/path.py
+++ b/bsie/extractor/generic/path.py
@@ -4,12 +4,12 @@ Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import os
import typing
# bsie imports
-from bsie.base import extractor
+from bsie.extractor import base
from bsie.utils import bsfs, node, ns
# exports
@@ -20,7 +20,7 @@ __all__: typing.Sequence[str] = (
## code ##
-class Path(extractor.Extractor):
+class Path(base.Extractor):
"""Extract information from file's path."""
CONTENT_READER = 'bsie.reader.path.Path'
@@ -29,7 +29,7 @@ class Path(extractor.Extractor):
_callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[str], typing.Any]]
def __init__(self):
- super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + '''
bse:filename rdfs:subClassOf bsfs:Predicate ;
rdfs:domain bsfs:File ;
rdfs:range xsd:string ;
diff --git a/bsie/extractor/generic/stat.py b/bsie/extractor/generic/stat.py
index 0b9ce29..9394456 100644
--- a/bsie/extractor/generic/stat.py
+++ b/bsie/extractor/generic/stat.py
@@ -4,14 +4,16 @@ Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
-# imports
+# standard imports
import os
import typing
# bsie imports
-from bsie.base import extractor
from bsie.utils import bsfs, node, ns
+# inner-module imports
+from .. import base
+
# exports
__all__: typing.Sequence[str] = (
'Stat',
@@ -20,7 +22,7 @@ __all__: typing.Sequence[str] = (
## code ##
-class Stat(extractor.Extractor):
+class Stat(base.Extractor):
"""Extract information from the file system."""
CONTENT_READER = 'bsie.reader.stat.Stat'
@@ -29,7 +31,7 @@ class Stat(extractor.Extractor):
_callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[os.stat_result], typing.Any]]
def __init__(self):
- super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + '''
bse:filesize rdfs:subClassOf bsfs:Predicate ;
rdfs:domain bsfs:File ;
rdfs:range xsd:integer ;
diff --git a/bsie/extractor/image/__init__.py b/bsie/extractor/image/__init__.py
new file mode 100644
index 0000000..75b118d
--- /dev/null
+++ b/bsie/extractor/image/__init__.py
@@ -0,0 +1,13 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# standard imports
+import typing
+
+# exports
+__all__: typing.Sequence[str] = []
+
+## EOF ##
diff --git a/bsie/extractor/image/colors_spatial.py b/bsie/extractor/image/colors_spatial.py
new file mode 100644
index 0000000..ce5b9f2
--- /dev/null
+++ b/bsie/extractor/image/colors_spatial.py
@@ -0,0 +1,154 @@
+"""Spatial color features.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# standard imports
+import typing
+
+# external imports
+import PIL.Image
+import numpy as np
+
+# bsie imports
+from bsie.utils import bsfs, node, ns
+
+# inner-module imports
+from .. import base
+
+# constants
+FEATURE_NAME = ns.bsf + 'ColorsSpatial'
+PREDICATE_NAME = ns.bse + 'colors_spatial'
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'ColorsSpatial',
+ )
+
+
+## code ##
+
+class ColorsSpatial(base.Extractor):
+ """Determine dominant colors of subregions in the image.
+
+ Computes the dominant color of increasingly smaller subregions of the image.
+ """
+
+ CONTENT_READER = 'bsie.reader.image.Image'
+
+ # Initial subregion width.
+ width: int
+
+ # Initial subregion height.
+ height: int
+
+ # Decrement exponent.
+ exp: float
+
+ # Principal predicate's URI.
+ _predicate_name: bsfs.URI
+
+ def __init__(
+ self,
+ width: int = 32,
+ height: int = 32,
+ exp: float = 4.,
+ ):
+ # instance identifier
+ uuid = bsfs.uuid.UCID.from_dict({
+ 'width': width,
+ 'height': height,
+ 'exp': exp,
+ })
+ # determine symbol names
+ instance_name = FEATURE_NAME[uuid]
+ predicate_name = PREDICATE_NAME[uuid]
+ # get vector dimension
+ dimension = self.dimension(width, height, exp)
+ # initialize parent with the schema
+ super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + f'''
+ <{FEATURE_NAME}> rdfs:subClassOf bsfs:Feature ;
+ # annotations
+ rdfs:label "Spatially dominant colors"^^xsd:string ;
+ schema:description "Domiant colors of subregions in an image."^^xsd:string ;
+ bsfs:dtype xsd:integer .
+
+ <{instance_name}> rdfs:subClassOf <{FEATURE_NAME}> ;
+ bsfs:dimension "{dimension}"^^xsd:integer ;
+ # annotations
+ <{FEATURE_NAME}/args#width> "{width}"^^xsd:integer ;
+ <{FEATURE_NAME}/args#height> "{height}"^^xsd:integer ;
+ <{FEATURE_NAME}/args#exp> "{exp}"^^xsd:float .
+
+ <{predicate_name}> rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:File ;
+ rdfs:range <{instance_name}> ;
+ bsfs:unique "true"^^xsd:boolean .
+
+ '''))
+ # assign extra members
+ self.width = width
+ self.height = height
+ self.exp = exp
+ self._predicate_name = predicate_name
+
+ def __repr__(self) -> str:
+ return f'{bsfs.typename(self)}({self.width}, {self.height}, {self.exp})'
+
+ def __eq__(self, other: typing.Any) -> bool:
+ return super().__eq__(other) \
+ and self.width == other.width \
+ and self.height == other.height \
+ and self.exp == other.exp
+
+ def __hash__(self) -> int:
+ return hash((super().__hash__(), self.width, self.height, self.exp))
+
+ @staticmethod
+ def dimension(width: int, height: int, exp: float) -> int:
+ """Return the feature vector dimension."""
+ # FIXME: replace with a proper formula
+ dim = 0
+ while width >= 1 and height >= 1:
+ dim += width * height
+ width = np.floor(width / exp)
+ height = np.floor(height / exp)
+ dim *= 3 # per band
+ return int(dim)
+
+ def extract(
+ self,
+ subject: node.Node,
+ content: PIL.Image,
+ principals: typing.Iterable[bsfs.schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+ # check principals
+ if self.schema.predicate(self._predicate_name) not in principals:
+ # nothing to do; abort
+ return
+
+ # convert to HSV
+ content = content.convert('HSV')
+
+ # get dimensions
+ width, height = self.width, self.height
+ num_bands = len(content.getbands()) # it's three since we converted to HSV before
+
+ features = []
+ while width >= 1 and height >= 1:
+ # downsample
+ img = content.resize((width, height), resample=PIL.Image.Resampling.BOX)
+ # feature vector
+ features.append(
+ np.array(img.getdata()).reshape((width * height, num_bands)))
+ # iterate
+ width = int(np.floor(width / self.exp))
+ height = int(np.floor(height / self.exp))
+
+ # combine bands and convert features to tuple
+ value = tuple(np.vstack(features).reshape(-1))
+ # return triple with feature vector as value
+ yield subject, self.schema.predicate(self._predicate_name), value
+
+## EOF ##