Merge branch 'mb/features' into develop

author: Matthias Baumgartner <dev@igsor.net> 2023-01-16 21:43:38 +0100
committer: Matthias Baumgartner <dev@igsor.net> 2023-01-16 21:43:38 +0100
commit: e12cd52ad267563c8046a593ad551b1dd089a702 (patch)
tree: d94cdf7ac540eb82630f78cbf564682b66007f51 /bsfs/triple_store
parent: 7f5a2920ef311b2077300714d7700313077a0bf6 (diff)
parent: 3504609e1ba1f7f653fa79910474bebd3ec24d8a (diff)
download: bsfs-e12cd52ad267563c8046a593ad551b1dd089a702.tar.gz
bsfs-e12cd52ad267563c8046a593ad551b1dd089a702.tar.bz2
bsfs-e12cd52ad267563c8046a593ad551b1dd089a702.zip
3 files changed, 143 insertions, 45 deletions
diff --git a/bsfs/triple_store/sparql/distance.py b/bsfs/triple_store/sparql/distance.py
new file mode 100644
index 0000000..2f5387a
--- /dev/null
+++ b/bsfs/triple_store/sparql/distance.py
@@ -0,0 +1,56 @@
+"""
+
+Part of the BlackStar filesystem (bsfs) module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# standard imports
+import typing
+
+# external imports
+import numpy as np
+
+# bsfs imports
+from bsfs.namespace import ns
+
+# constants
+EPS = 1e-9
+
+# exports
+__all__: typing.Sequence[str] = (
+    'DISTANCE_FU',
+    )
+
+
+## code ##
+
+def euclid(fst, snd) -> float:
+    """Euclidean distance (l2 norm)."""
+    fst = np.array(fst)
+    snd = np.array(snd)
+    return float(np.linalg.norm(fst - snd))
+
+def cosine(fst, snd) -> float:
+    """Cosine distance."""
+    fst = np.array(fst)
+    snd = np.array(snd)
+    if (fst == snd).all():
+        return 0.0
+    nrm0 = np.linalg.norm(fst)
+    nrm1 = np.linalg.norm(snd)
+    return float(1.0 - np.dot(fst, snd) / (nrm0 * nrm1 + EPS))
+
+def manhatten(fst, snd) -> float:
+    """Manhatten (cityblock) distance (l1 norm)."""
+    fst = np.array(fst)
+    snd = np.array(snd)
+    return float(np.abs(fst - snd).sum())
+
+# Known distance functions.
+DISTANCE_FU = {
+    ns.bsfs.euclidean: euclid,
+    ns.bsfs.cosine: cosine,
+    ns.bsfs.manhatten: manhatten,
+}
+
+## EOF ##
diff --git a/bsfs/triple_store/sparql/parse_filter.py b/bsfs/triple_store/sparql/parse_filter.py
index d4db0aa..8b6b976 100644
--- a/bsfs/triple_store/sparql/parse_filter.py
+++ b/bsfs/triple_store/sparql/parse_filter.py
@@ -5,19 +5,29 @@ A copy of the license is provided with the project.
 Author: Matthias Baumgartner, 2022
 """
 # imports
+import operator
 import typing
 
+# external imports
+import rdflib
+
 # bsfs imports
 from bsfs import schema as bsc
 from bsfs.namespace import ns
 from bsfs.query import ast
 from bsfs.utils import URI, errors
 
+# inner-module imports
+from .distance import DISTANCE_FU
+
 # exports
 __all__: typing.Sequence[str] = (
     'Filter',
     )
 
+
+## code ##
+
 class _GenHopName():
     """Generator that produces a new unique symbol name with each iteration."""
 
@@ -46,10 +56,8 @@ class Filter():
     # Generator that produces unique symbol names.
     ngen: _GenHopName
 
-    # Vertex type.
-    T_VERTEX = typing.Union[bsc.Node, bsc.Literal]
-
-    def __init__(self, schema):
+    def __init__(self, graph, schema):
+        self.graph = graph
         self.schema = schema
         self.ngen = _GenHopName()
 
@@ -79,7 +87,7 @@ class Filter():
             }}
             '''
 
-    def _parse_filter_expression(self, type_: T_VERTEX, node: ast.filter.FilterExpression, head: str) -> str:
+    def _parse_filter_expression(self, type_: bsc.Vertex, node: ast.filter.FilterExpression, head: str) -> str:
         """Route *node* to the handler of the respective FilterExpression subclass."""
         if isinstance(node, ast.filter.Is):
             return self._is(type_, node, head)
@@ -87,6 +95,8 @@ class Filter():
             return self._not(type_, node, head)
         if isinstance(node, ast.filter.Has):
             return self._has(type_, node, head)
+        if isinstance(node, ast.filter.Distance):
+            return self._distance(type_, node, head)
         if isinstance(node, ast.filter.Any):
             return self._any(type_, node, head)
         if isinstance(node, ast.filter.All):
@@ -112,9 +122,9 @@ class Filter():
 
     def _parse_predicate_expression(
             self,
-            type_: T_VERTEX,
+            type_: bsc.Vertex,
             node: ast.filter.PredicateExpression
-            ) -> typing.Tuple[str, T_VERTEX]:
+            ) -> typing.Tuple[str, bsc.Vertex]:
         """Route *node* to the handler of the respective PredicateExpression subclass."""
         if isinstance(node, ast.filter.Predicate):
             return self._predicate(type_, node)
@@ -123,7 +133,7 @@ class Filter():
         # invalid node
         raise errors.BackendError(f'expected predicate expression, found {node}')
 
-    def _one_of(self, node_type: T_VERTEX, node: ast.filter.OneOf) -> typing.Tuple[str, T_VERTEX]:
+    def _one_of(self, node_type: bsc.Vertex, node: ast.filter.OneOf) -> typing.Tuple[str, bsc.Vertex]:
         """
         """
         if not isinstance(node_type, bsc.Node):
@@ -134,23 +144,18 @@ class Filter():
             puri, subrng = self._parse_predicate_expression(node_type, pred)
             # track predicate uris
             suburi.add(puri)
-            try:
-                # check for more generic range
-                if rng is None or subrng > rng:
-                    rng = subrng
-                # check range consistency
-                if not subrng <= rng and not subrng >= rng:
-                    raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related')
-            except TypeError as err: # subrng and rng are not comparable
-                raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related') from err
-        if rng is None:
-            # for mypy to be certain of the rng type
-            # if rng were None, we'd have gotten a TypeError above (None > None)
-            raise errors.UnreachableError()
+            # check for more generic range
+            if rng is None or subrng > rng:
+                rng = subrng
+            # check range consistency
+            if not subrng <= rng and not subrng >= rng:
+                raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related')
         # return joint predicate expression and next range
-        return '|'.join(suburi), rng
+        # OneOf guarantees at least one expression, rng is always a bsc.Vertex.
+        # mypy does not realize this, hence we ignore the warning.
+        return '|'.join(suburi), rng # type: ignore [return-value]
 
-    def _predicate(self, node_type: T_VERTEX, node: ast.filter.Predicate) -> typing.Tuple[str, T_VERTEX]:
+    def _predicate(self, node_type: bsc.Vertex, node: ast.filter.Predicate) -> typing.Tuple[str, bsc.Vertex]:
         """
         """
         # check node_type
@@ -162,9 +167,8 @@ class Filter():
         if not self.schema.has_predicate(puri):
             raise errors.ConsistencyError(f'predicate {puri} is not in the schema')
         pred = self.schema.predicate(puri)
-        if pred.range is None:
-            # FIXME: It is a design error that Predicates can have a None range...
-            raise errors.BackendError(f'predicate {pred} has no range')
+        if not isinstance(pred.range, (bsc.Node, bsc.Literal)):
+            raise errors.BackendError(f'the range of predicate {pred} is undefined')
         dom, rng = pred.domain, pred.range
         # encapsulate predicate uri
         puri = f'<{puri}>' # type: ignore [assignment] # variable re-use confuses mypy
@@ -178,7 +182,7 @@ class Filter():
         # return predicate URI and next node type
         return puri, rng
 
-    def _any(self, node_type: T_VERTEX, node: ast.filter.Any, head: str) -> str:
+    def _any(self, node_type: bsc.Vertex, node: ast.filter.Any, head: str) -> str:
         """
         """
         if not isinstance(node_type, bsc.Node):
@@ -191,7 +195,7 @@ class Filter():
         # combine results
         return f'{head} {pred} {nexthead} . {expr}'
 
-    def _all(self, node_type: T_VERTEX, node: ast.filter.All, head: str) -> str:
+    def _all(self, node_type: bsc.Vertex, node: ast.filter.All, head: str) -> str:
         """
         """
         # NOTE: All(P, E) := Not(Any(P, Not(E))) and EXISTS(P, ?)
@@ -208,13 +212,13 @@ class Filter():
         # return existence and rewritten expression
         return f'FILTER EXISTS {{ {head} {pred} {temphead} }} . ' + expr
 
-    def _and(self, node_type: T_VERTEX, node: ast.filter.And, head: str) -> str:
+    def _and(self, node_type: bsc.Vertex, node: ast.filter.And, head: str) -> str:
         """
         """
         sub = [self._parse_filter_expression(node_type, expr, head) for expr in node]
         return ' . '.join(sub)
 
-    def _or(self, node_type: T_VERTEX, node: ast.filter.Or, head: str) -> str:
+    def _or(self, node_type: bsc.Vertex, node: ast.filter.Or, head: str) -> str:
         """
         """
         # potential special case optimization:
@@ -224,7 +228,7 @@ class Filter():
         sub = ['{' + expr + '}' for expr in sub]
         return ' UNION '.join(sub)
 
-    def _not(self, node_type: T_VERTEX, node: ast.filter.Not, head: str) -> str:
+    def _not(self, node_type: bsc.Vertex, node: ast.filter.Not, head: str) -> str:
         """
         """
         expr = self._parse_filter_expression(node_type, node.expr, head)
@@ -235,7 +239,7 @@ class Filter():
         # The simplest (and non-interfering) choice is a type statement.
         return f'MINUS {{ {head} <{ns.rdf.type}>/<{ns.rdfs.subClassOf}>* <{node_type.uri}> . {expr} }}'
 
-    def _has(self, node_type: T_VERTEX, node: ast.filter.Has, head: str) -> str:
+    def _has(self, node_type: bsc.Vertex, node: ast.filter.Has, head: str) -> str:
         """
         """
         if not isinstance(node_type, bsc.Node):
@@ -248,47 +252,72 @@ class Filter():
         # predicate count expression (fetch number of predicates at *head*)
         num_preds = f'{{ SELECT (COUNT(distinct {inner}) as {outer}) WHERE {{ {head} {pred} {inner} }} }}'
         # count expression
-        # FIXME: We have to ensure that ns.xsd.integer is always known in the schema!
         count_bounds = self._parse_filter_expression(self.schema.literal(ns.xsd.integer), node.count, outer)
         # combine
         return num_preds + ' . ' + count_bounds
 
-    def _is(self, node_type: T_VERTEX, node: ast.filter.Is, head: str) -> str:
+    def _distance(self, node_type: bsc.Vertex, node: ast.filter.Distance, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Feature):
+            raise errors.BackendError(f'expected Feature, found {node_type}')
+        if len(node.reference) != node_type.dimension:
+            raise errors.ConsistencyError(
+                f'reference has dimension {len(node.reference)}, expected {node_type.dimension}')
+        # get distance metric
+        dist = DISTANCE_FU[node_type.distance]
+        # get operator
+        cmp = operator.lt if node.strict else operator.le
+        # get candidate values
+        candidates = {
+            f'"{cand}"^^<{node_type.uri}>'
+            for cand
+            in self.graph.objects()
+            if isinstance(cand, rdflib.Literal)
+            and cand.datatype == rdflib.URIRef(node_type.uri)
+            and cmp(dist(cand.value, node.reference), node.threshold)
+            }
+        # combine candidate values
+        values = ' '.join(candidates) if len(candidates) else f'"impossible value"^^<{ns.xsd.string}>'
+        # return sparql fragment
+        return f'VALUES {head} {{ {values} }}'
+
+    def _is(self, node_type: bsc.Vertex, node: ast.filter.Is, head: str) -> str:
         """
         """
         if not isinstance(node_type, bsc.Node):
             raise errors.BackendError(f'expected Node, found {node_type}')
         return f'VALUES {head} {{ <{node.value}> }}'
 
-    def _equals(self, node_type: T_VERTEX, node: ast.filter.Equals, head: str) -> str:
+    def _equals(self, node_type: bsc.Vertex, node: ast.filter.Equals, head: str) -> str:
         """
         """
         if not isinstance(node_type, bsc.Literal):
             raise errors.BackendError(f'expected Literal, found {node}')
         return f'VALUES {head} {{ "{node.value}"^^<{node_type.uri}> }}'
 
-    def _substring(self, node_type: T_VERTEX, node: ast.filter.Substring, head: str) -> str:
+    def _substring(self, node_type: bsc.Vertex, node: ast.filter.Substring, head: str) -> str:
         """
         """
         if not isinstance(node_type, bsc.Literal):
             raise errors.BackendError(f'expected Literal, found {node_type}')
         return f'FILTER contains(str({head}), "{node.value}")'
 
-    def _starts_with(self, node_type: T_VERTEX, node: ast.filter.StartsWith, head: str) -> str:
+    def _starts_with(self, node_type: bsc.Vertex, node: ast.filter.StartsWith, head: str) -> str:
         """
         """
         if not isinstance(node_type, bsc.Literal):
             raise errors.BackendError(f'expected Literal, found {node_type}')
         return f'FILTER strstarts(str({head}), "{node.value}")'
 
-    def _ends_with(self, node_type: T_VERTEX, node: ast.filter.EndsWith, head: str) -> str:
+    def _ends_with(self, node_type: bsc.Vertex, node: ast.filter.EndsWith, head: str) -> str:
         """
         """
         if not isinstance(node_type, bsc.Literal):
             raise errors.BackendError(f'expected Literal, found {node_type}')
         return f'FILTER strends(str({head}), "{node.value}")'
 
-    def _less_than(self, node_type: T_VERTEX, node: ast.filter.LessThan, head: str) -> str:
+    def _less_than(self, node_type: bsc.Vertex, node: ast.filter.LessThan, head: str) -> str:
         """
         """
         if not isinstance(node_type, bsc.Literal):
@@ -296,7 +325,7 @@ class Filter():
         equality = '=' if not node.strict else ''
         return f'FILTER ({head} <{equality} {float(node.threshold)})'
 
-    def _greater_than(self, node_type: T_VERTEX, node: ast.filter.GreaterThan, head: str) -> str:
+    def _greater_than(self, node_type: bsc.Vertex, node: ast.filter.GreaterThan, head: str) -> str:
         """
         """
         if not isinstance(node_type, bsc.Literal):
diff --git a/bsfs/triple_store/sparql/sparql.py b/bsfs/triple_store/sparql/sparql.py
index c3cbff6..fedd227 100644
--- a/bsfs/triple_store/sparql/sparql.py
+++ b/bsfs/triple_store/sparql/sparql.py
@@ -11,12 +11,14 @@ import rdflib
 
 # bsfs imports
 from bsfs import schema as bsc
+from bsfs.namespace import ns
 from bsfs.query import ast
 from bsfs.utils import errors, URI
 
 # inner-module imports
 from . import parse_filter
 from .. import base
+from .distance import DISTANCE_FU
 
 
 # exports
@@ -94,8 +96,9 @@ class SparqlStore(base.TripleStoreBase):
         super().__init__(None)
         self._graph = rdflib.Graph()
         self._transaction = _Transaction(self._graph)
-        self._schema = bsc.Schema.Empty()
-        self._filter_parser = parse_filter.Filter(self._schema)
+        # NOTE: parsing bsfs.query.ast.filter.Has requires xsd:integer.
+        self._schema = bsc.Schema(literals={bsc.ROOT_NUMBER.child(ns.xsd.integer)})
+        self._filter_parser = parse_filter.Filter(self._graph, self._schema)
 
     # NOTE: mypy and pylint complain about the **kwargs not being listed (contrasting super)
     # However, not having it here is clearer since it's explicit that there are no arguments.
@@ -121,6 +124,16 @@ class SparqlStore(base.TripleStoreBase):
         # check compatibility: No contradicting definitions
         if not self.schema.consistent_with(schema):
             raise errors.ConsistencyError(f'{schema} is inconsistent with {self.schema}')
+        # check distance functions of features
+        invalid = {
+            (cand.uri, cand.distance)
+            for cand
+            in schema.literals()
+            if isinstance(cand, bsc.Feature) and cand.distance not in DISTANCE_FU}
+        if len(invalid) > 0:
+            cand, dist = zip(*invalid)
+            raise errors.UnsupportedError(
+                f'unknown distance function {",".join(dist)} in feature {", ".join(cand)}')
 
         # commit the current transaction
         self.commit()
@@ -137,7 +150,7 @@ class SparqlStore(base.TripleStoreBase):
             for src, trg in self._graph.subject_objects(rdflib.URIRef(pred.uri)):
                 self._transaction.remove((src, rdflib.URIRef(pred.uri), trg))
             # remove predicate definition
-            if pred.parent is not None:
+            if pred.parent is not None: # NOTE: there shouldn't be any predicate w/o parent
                 self._transaction.remove((
                     rdflib.URIRef(pred.uri),
                     rdflib.RDFS.subClassOf,
@@ -157,7 +170,7 @@ class SparqlStore(base.TripleStoreBase):
                 # remove instance
                 self._transaction.remove((inst, rdflib.RDF.type, rdflib.URIRef(node.uri)))
             # remove node definition
-            if node.parent is not None:
+            if node.parent is not None: # NOTE: there shouldn't be any node w/o parent
                 self._transaction.remove((
                     rdflib.URIRef(node.uri),
                     rdflib.RDFS.subClassOf,
@@ -166,7 +179,7 @@ class SparqlStore(base.TripleStoreBase):
 
         for lit in sub.literals:
             # remove literal definition
-            if lit.parent is not None:
+            if lit.parent is not None: # NOTE: there shouldn't be any literal w/o parent
                 self._transaction.remove((
                     rdflib.URIRef(lit.uri),
                     rdflib.RDFS.subClassOf,
author	Matthias Baumgartner <dev@igsor.net>	2023-01-16 21:43:38 +0100
committer	Matthias Baumgartner <dev@igsor.net>	2023-01-16 21:43:38 +0100
commit	e12cd52ad267563c8046a593ad551b1dd089a702 (patch)
tree	d94cdf7ac540eb82630f78cbf564682b66007f51 /bsfs/triple_store
parent	7f5a2920ef311b2077300714d7700313077a0bf6 (diff)
parent	3504609e1ba1f7f653fa79910474bebd3ec24d8a (diff)
download	bsfs-e12cd52ad267563c8046a593ad551b1dd089a702.tar.gz bsfs-e12cd52ad267563c8046a593ad551b1dd089a702.tar.bz2 bsfs-e12cd52ad267563c8046a593ad551b1dd089a702.zip