diff options
author | Matthias Baumgartner <dev@igsor.net> | 2023-01-16 21:43:38 +0100 |
---|---|---|
committer | Matthias Baumgartner <dev@igsor.net> | 2023-01-16 21:43:38 +0100 |
commit | e12cd52ad267563c8046a593ad551b1dd089a702 (patch) | |
tree | d94cdf7ac540eb82630f78cbf564682b66007f51 /bsfs/triple_store | |
parent | 7f5a2920ef311b2077300714d7700313077a0bf6 (diff) | |
parent | 3504609e1ba1f7f653fa79910474bebd3ec24d8a (diff) | |
download | bsfs-e12cd52ad267563c8046a593ad551b1dd089a702.tar.gz bsfs-e12cd52ad267563c8046a593ad551b1dd089a702.tar.bz2 bsfs-e12cd52ad267563c8046a593ad551b1dd089a702.zip |
Merge branch 'mb/features' into develop
Diffstat (limited to 'bsfs/triple_store')
-rw-r--r-- | bsfs/triple_store/sparql/distance.py | 56 | ||||
-rw-r--r-- | bsfs/triple_store/sparql/parse_filter.py | 109 | ||||
-rw-r--r-- | bsfs/triple_store/sparql/sparql.py | 23 |
3 files changed, 143 insertions, 45 deletions
diff --git a/bsfs/triple_store/sparql/distance.py b/bsfs/triple_store/sparql/distance.py new file mode 100644 index 0000000..2f5387a --- /dev/null +++ b/bsfs/triple_store/sparql/distance.py @@ -0,0 +1,56 @@ +""" + +Part of the BlackStar filesystem (bsfs) module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# standard imports +import typing + +# external imports +import numpy as np + +# bsfs imports +from bsfs.namespace import ns + +# constants +EPS = 1e-9 + +# exports +__all__: typing.Sequence[str] = ( + 'DISTANCE_FU', + ) + + +## code ## + +def euclid(fst, snd) -> float: + """Euclidean distance (l2 norm).""" + fst = np.array(fst) + snd = np.array(snd) + return float(np.linalg.norm(fst - snd)) + +def cosine(fst, snd) -> float: + """Cosine distance.""" + fst = np.array(fst) + snd = np.array(snd) + if (fst == snd).all(): + return 0.0 + nrm0 = np.linalg.norm(fst) + nrm1 = np.linalg.norm(snd) + return float(1.0 - np.dot(fst, snd) / (nrm0 * nrm1 + EPS)) + +def manhatten(fst, snd) -> float: + """Manhatten (cityblock) distance (l1 norm).""" + fst = np.array(fst) + snd = np.array(snd) + return float(np.abs(fst - snd).sum()) + +# Known distance functions. +DISTANCE_FU = { + ns.bsfs.euclidean: euclid, + ns.bsfs.cosine: cosine, + ns.bsfs.manhatten: manhatten, +} + +## EOF ## diff --git a/bsfs/triple_store/sparql/parse_filter.py b/bsfs/triple_store/sparql/parse_filter.py index d4db0aa..8b6b976 100644 --- a/bsfs/triple_store/sparql/parse_filter.py +++ b/bsfs/triple_store/sparql/parse_filter.py @@ -5,19 +5,29 @@ A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ # imports +import operator import typing +# external imports +import rdflib + # bsfs imports from bsfs import schema as bsc from bsfs.namespace import ns from bsfs.query import ast from bsfs.utils import URI, errors +# inner-module imports +from .distance import DISTANCE_FU + # exports __all__: typing.Sequence[str] = ( 'Filter', ) + +## code ## + class _GenHopName(): """Generator that produces a new unique symbol name with each iteration.""" @@ -46,10 +56,8 @@ class Filter(): # Generator that produces unique symbol names. ngen: _GenHopName - # Vertex type. - T_VERTEX = typing.Union[bsc.Node, bsc.Literal] - - def __init__(self, schema): + def __init__(self, graph, schema): + self.graph = graph self.schema = schema self.ngen = _GenHopName() @@ -79,7 +87,7 @@ class Filter(): }} ''' - def _parse_filter_expression(self, type_: T_VERTEX, node: ast.filter.FilterExpression, head: str) -> str: + def _parse_filter_expression(self, type_: bsc.Vertex, node: ast.filter.FilterExpression, head: str) -> str: """Route *node* to the handler of the respective FilterExpression subclass.""" if isinstance(node, ast.filter.Is): return self._is(type_, node, head) @@ -87,6 +95,8 @@ class Filter(): return self._not(type_, node, head) if isinstance(node, ast.filter.Has): return self._has(type_, node, head) + if isinstance(node, ast.filter.Distance): + return self._distance(type_, node, head) if isinstance(node, ast.filter.Any): return self._any(type_, node, head) if isinstance(node, ast.filter.All): @@ -112,9 +122,9 @@ class Filter(): def _parse_predicate_expression( self, - type_: T_VERTEX, + type_: bsc.Vertex, node: ast.filter.PredicateExpression - ) -> typing.Tuple[str, T_VERTEX]: + ) -> typing.Tuple[str, bsc.Vertex]: """Route *node* to the handler of the respective PredicateExpression subclass.""" if isinstance(node, ast.filter.Predicate): return self._predicate(type_, node) @@ -123,7 +133,7 @@ class Filter(): # invalid node raise errors.BackendError(f'expected predicate expression, found {node}') - def _one_of(self, node_type: T_VERTEX, node: ast.filter.OneOf) -> typing.Tuple[str, T_VERTEX]: + def _one_of(self, node_type: bsc.Vertex, node: ast.filter.OneOf) -> typing.Tuple[str, bsc.Vertex]: """ """ if not isinstance(node_type, bsc.Node): @@ -134,23 +144,18 @@ class Filter(): puri, subrng = self._parse_predicate_expression(node_type, pred) # track predicate uris suburi.add(puri) - try: - # check for more generic range - if rng is None or subrng > rng: - rng = subrng - # check range consistency - if not subrng <= rng and not subrng >= rng: - raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related') - except TypeError as err: # subrng and rng are not comparable - raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related') from err - if rng is None: - # for mypy to be certain of the rng type - # if rng were None, we'd have gotten a TypeError above (None > None) - raise errors.UnreachableError() + # check for more generic range + if rng is None or subrng > rng: + rng = subrng + # check range consistency + if not subrng <= rng and not subrng >= rng: + raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related') # return joint predicate expression and next range - return '|'.join(suburi), rng + # OneOf guarantees at least one expression, rng is always a bsc.Vertex. + # mypy does not realize this, hence we ignore the warning. + return '|'.join(suburi), rng # type: ignore [return-value] - def _predicate(self, node_type: T_VERTEX, node: ast.filter.Predicate) -> typing.Tuple[str, T_VERTEX]: + def _predicate(self, node_type: bsc.Vertex, node: ast.filter.Predicate) -> typing.Tuple[str, bsc.Vertex]: """ """ # check node_type @@ -162,9 +167,8 @@ class Filter(): if not self.schema.has_predicate(puri): raise errors.ConsistencyError(f'predicate {puri} is not in the schema') pred = self.schema.predicate(puri) - if pred.range is None: - # FIXME: It is a design error that Predicates can have a None range... - raise errors.BackendError(f'predicate {pred} has no range') + if not isinstance(pred.range, (bsc.Node, bsc.Literal)): + raise errors.BackendError(f'the range of predicate {pred} is undefined') dom, rng = pred.domain, pred.range # encapsulate predicate uri puri = f'<{puri}>' # type: ignore [assignment] # variable re-use confuses mypy @@ -178,7 +182,7 @@ class Filter(): # return predicate URI and next node type return puri, rng - def _any(self, node_type: T_VERTEX, node: ast.filter.Any, head: str) -> str: + def _any(self, node_type: bsc.Vertex, node: ast.filter.Any, head: str) -> str: """ """ if not isinstance(node_type, bsc.Node): @@ -191,7 +195,7 @@ class Filter(): # combine results return f'{head} {pred} {nexthead} . {expr}' - def _all(self, node_type: T_VERTEX, node: ast.filter.All, head: str) -> str: + def _all(self, node_type: bsc.Vertex, node: ast.filter.All, head: str) -> str: """ """ # NOTE: All(P, E) := Not(Any(P, Not(E))) and EXISTS(P, ?) @@ -208,13 +212,13 @@ class Filter(): # return existence and rewritten expression return f'FILTER EXISTS {{ {head} {pred} {temphead} }} . ' + expr - def _and(self, node_type: T_VERTEX, node: ast.filter.And, head: str) -> str: + def _and(self, node_type: bsc.Vertex, node: ast.filter.And, head: str) -> str: """ """ sub = [self._parse_filter_expression(node_type, expr, head) for expr in node] return ' . '.join(sub) - def _or(self, node_type: T_VERTEX, node: ast.filter.Or, head: str) -> str: + def _or(self, node_type: bsc.Vertex, node: ast.filter.Or, head: str) -> str: """ """ # potential special case optimization: @@ -224,7 +228,7 @@ class Filter(): sub = ['{' + expr + '}' for expr in sub] return ' UNION '.join(sub) - def _not(self, node_type: T_VERTEX, node: ast.filter.Not, head: str) -> str: + def _not(self, node_type: bsc.Vertex, node: ast.filter.Not, head: str) -> str: """ """ expr = self._parse_filter_expression(node_type, node.expr, head) @@ -235,7 +239,7 @@ class Filter(): # The simplest (and non-interfering) choice is a type statement. return f'MINUS {{ {head} <{ns.rdf.type}>/<{ns.rdfs.subClassOf}>* <{node_type.uri}> . {expr} }}' - def _has(self, node_type: T_VERTEX, node: ast.filter.Has, head: str) -> str: + def _has(self, node_type: bsc.Vertex, node: ast.filter.Has, head: str) -> str: """ """ if not isinstance(node_type, bsc.Node): @@ -248,47 +252,72 @@ class Filter(): # predicate count expression (fetch number of predicates at *head*) num_preds = f'{{ SELECT (COUNT(distinct {inner}) as {outer}) WHERE {{ {head} {pred} {inner} }} }}' # count expression - # FIXME: We have to ensure that ns.xsd.integer is always known in the schema! count_bounds = self._parse_filter_expression(self.schema.literal(ns.xsd.integer), node.count, outer) # combine return num_preds + ' . ' + count_bounds - def _is(self, node_type: T_VERTEX, node: ast.filter.Is, head: str) -> str: + def _distance(self, node_type: bsc.Vertex, node: ast.filter.Distance, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Feature): + raise errors.BackendError(f'expected Feature, found {node_type}') + if len(node.reference) != node_type.dimension: + raise errors.ConsistencyError( + f'reference has dimension {len(node.reference)}, expected {node_type.dimension}') + # get distance metric + dist = DISTANCE_FU[node_type.distance] + # get operator + cmp = operator.lt if node.strict else operator.le + # get candidate values + candidates = { + f'"{cand}"^^<{node_type.uri}>' + for cand + in self.graph.objects() + if isinstance(cand, rdflib.Literal) + and cand.datatype == rdflib.URIRef(node_type.uri) + and cmp(dist(cand.value, node.reference), node.threshold) + } + # combine candidate values + values = ' '.join(candidates) if len(candidates) else f'"impossible value"^^<{ns.xsd.string}>' + # return sparql fragment + return f'VALUES {head} {{ {values} }}' + + def _is(self, node_type: bsc.Vertex, node: ast.filter.Is, head: str) -> str: """ """ if not isinstance(node_type, bsc.Node): raise errors.BackendError(f'expected Node, found {node_type}') return f'VALUES {head} {{ <{node.value}> }}' - def _equals(self, node_type: T_VERTEX, node: ast.filter.Equals, head: str) -> str: + def _equals(self, node_type: bsc.Vertex, node: ast.filter.Equals, head: str) -> str: """ """ if not isinstance(node_type, bsc.Literal): raise errors.BackendError(f'expected Literal, found {node}') return f'VALUES {head} {{ "{node.value}"^^<{node_type.uri}> }}' - def _substring(self, node_type: T_VERTEX, node: ast.filter.Substring, head: str) -> str: + def _substring(self, node_type: bsc.Vertex, node: ast.filter.Substring, head: str) -> str: """ """ if not isinstance(node_type, bsc.Literal): raise errors.BackendError(f'expected Literal, found {node_type}') return f'FILTER contains(str({head}), "{node.value}")' - def _starts_with(self, node_type: T_VERTEX, node: ast.filter.StartsWith, head: str) -> str: + def _starts_with(self, node_type: bsc.Vertex, node: ast.filter.StartsWith, head: str) -> str: """ """ if not isinstance(node_type, bsc.Literal): raise errors.BackendError(f'expected Literal, found {node_type}') return f'FILTER strstarts(str({head}), "{node.value}")' - def _ends_with(self, node_type: T_VERTEX, node: ast.filter.EndsWith, head: str) -> str: + def _ends_with(self, node_type: bsc.Vertex, node: ast.filter.EndsWith, head: str) -> str: """ """ if not isinstance(node_type, bsc.Literal): raise errors.BackendError(f'expected Literal, found {node_type}') return f'FILTER strends(str({head}), "{node.value}")' - def _less_than(self, node_type: T_VERTEX, node: ast.filter.LessThan, head: str) -> str: + def _less_than(self, node_type: bsc.Vertex, node: ast.filter.LessThan, head: str) -> str: """ """ if not isinstance(node_type, bsc.Literal): @@ -296,7 +325,7 @@ class Filter(): equality = '=' if not node.strict else '' return f'FILTER ({head} <{equality} {float(node.threshold)})' - def _greater_than(self, node_type: T_VERTEX, node: ast.filter.GreaterThan, head: str) -> str: + def _greater_than(self, node_type: bsc.Vertex, node: ast.filter.GreaterThan, head: str) -> str: """ """ if not isinstance(node_type, bsc.Literal): diff --git a/bsfs/triple_store/sparql/sparql.py b/bsfs/triple_store/sparql/sparql.py index c3cbff6..fedd227 100644 --- a/bsfs/triple_store/sparql/sparql.py +++ b/bsfs/triple_store/sparql/sparql.py @@ -11,12 +11,14 @@ import rdflib # bsfs imports from bsfs import schema as bsc +from bsfs.namespace import ns from bsfs.query import ast from bsfs.utils import errors, URI # inner-module imports from . import parse_filter from .. import base +from .distance import DISTANCE_FU # exports @@ -94,8 +96,9 @@ class SparqlStore(base.TripleStoreBase): super().__init__(None) self._graph = rdflib.Graph() self._transaction = _Transaction(self._graph) - self._schema = bsc.Schema.Empty() - self._filter_parser = parse_filter.Filter(self._schema) + # NOTE: parsing bsfs.query.ast.filter.Has requires xsd:integer. + self._schema = bsc.Schema(literals={bsc.ROOT_NUMBER.child(ns.xsd.integer)}) + self._filter_parser = parse_filter.Filter(self._graph, self._schema) # NOTE: mypy and pylint complain about the **kwargs not being listed (contrasting super) # However, not having it here is clearer since it's explicit that there are no arguments. @@ -121,6 +124,16 @@ class SparqlStore(base.TripleStoreBase): # check compatibility: No contradicting definitions if not self.schema.consistent_with(schema): raise errors.ConsistencyError(f'{schema} is inconsistent with {self.schema}') + # check distance functions of features + invalid = { + (cand.uri, cand.distance) + for cand + in schema.literals() + if isinstance(cand, bsc.Feature) and cand.distance not in DISTANCE_FU} + if len(invalid) > 0: + cand, dist = zip(*invalid) + raise errors.UnsupportedError( + f'unknown distance function {",".join(dist)} in feature {", ".join(cand)}') # commit the current transaction self.commit() @@ -137,7 +150,7 @@ class SparqlStore(base.TripleStoreBase): for src, trg in self._graph.subject_objects(rdflib.URIRef(pred.uri)): self._transaction.remove((src, rdflib.URIRef(pred.uri), trg)) # remove predicate definition - if pred.parent is not None: + if pred.parent is not None: # NOTE: there shouldn't be any predicate w/o parent self._transaction.remove(( rdflib.URIRef(pred.uri), rdflib.RDFS.subClassOf, @@ -157,7 +170,7 @@ class SparqlStore(base.TripleStoreBase): # remove instance self._transaction.remove((inst, rdflib.RDF.type, rdflib.URIRef(node.uri))) # remove node definition - if node.parent is not None: + if node.parent is not None: # NOTE: there shouldn't be any node w/o parent self._transaction.remove(( rdflib.URIRef(node.uri), rdflib.RDFS.subClassOf, @@ -166,7 +179,7 @@ class SparqlStore(base.TripleStoreBase): for lit in sub.literals: # remove literal definition - if lit.parent is not None: + if lit.parent is not None: # NOTE: there shouldn't be any literal w/o parent self._transaction.remove(( rdflib.URIRef(lit.uri), rdflib.RDFS.subClassOf, |