3 files changed, 357 insertions, 7 deletions
diff --git a/bsfs/triple_store/base.py b/bsfs/triple_store/base.py
index 5ff9523..7e03714 100644
--- a/bsfs/triple_store/base.py
+++ b/bsfs/triple_store/base.py
@@ -113,9 +113,11 @@ class TripleStoreBase(abc.ABC):
     def get(
             self,
             node_type: _schema.Node,
-            query: ast.filter.FilterExpression,
+            query: typing.Optional[ast.filter.FilterExpression] = None,
             ) -> typing.Iterator[URI]:
-        """Return guids of nodes of type *node_type* that match the *query*."""
+        """Return guids of nodes of type *node_type* that match the *query*.
+        Return all guids of the respective type if *query* is None.
+        """
 
     @abc.abstractmethod
     def exists(
diff --git a/bsfs/triple_store/sparql/parse_filter.py b/bsfs/triple_store/sparql/parse_filter.py
new file mode 100644
index 0000000..d4db0aa
--- /dev/null
+++ b/bsfs/triple_store/sparql/parse_filter.py
@@ -0,0 +1,307 @@
+"""
+
+Part of the BlackStar filesystem (bsfs) module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsfs imports
+from bsfs import schema as bsc
+from bsfs.namespace import ns
+from bsfs.query import ast
+from bsfs.utils import URI, errors
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Filter',
+    )
+
+class _GenHopName():
+    """Generator that produces a new unique symbol name with each iteration."""
+
+    # Symbol name prefix.
+    prefix: str
+
+    # Current counter.
+    curr: int
+
+    def __init__(self, prefix: str = '?hop', start: int = 0):
+        self.prefix = prefix
+        self.curr = start - 1
+
+    def __next__(self):
+        """Generate and return the next unique name."""
+        self.curr += 1
+        return self.prefix + str(self.curr)
+
+
+class Filter():
+    """Translate `bsfs.query.ast.filter` structures into Sparql queries."""
+
+    # Current schema to validate against.
+    schema: bsc.Schema
+
+    # Generator that produces unique symbol names.
+    ngen: _GenHopName
+
+    # Vertex type.
+    T_VERTEX = typing.Union[bsc.Node, bsc.Literal]
+
+    def __init__(self, schema):
+        self.schema = schema
+        self.ngen = _GenHopName()
+
+    def __call__(
+            self,
+            root_type: bsc.Node,
+            root: typing.Optional[ast.filter.FilterExpression] = None,
+            ) -> str:
+        """
+        """
+        # check root_type
+        if not isinstance(root_type, bsc.Node):
+            raise errors.BackendError(f'expected Node, found {root_type}')
+        if root_type not in self.schema.nodes():
+            raise errors.ConsistencyError(f'node {root_type} is not in the schema')
+        # parse root
+        if root is None:
+            cond = ''
+        else:
+            cond = self._parse_filter_expression(root_type, root, '?ent')
+        # assemble query
+        return f'''
+            SELECT ?ent
+            WHERE {{
+                ?ent <{ns.rdf.type}>/<{ns.rdfs.subClassOf}>* <{root_type.uri}> .
+                {cond}
+            }}
+            '''
+
+    def _parse_filter_expression(self, type_: T_VERTEX, node: ast.filter.FilterExpression, head: str) -> str:
+        """Route *node* to the handler of the respective FilterExpression subclass."""
+        if isinstance(node, ast.filter.Is):
+            return self._is(type_, node, head)
+        if isinstance(node, ast.filter.Not):
+            return self._not(type_, node, head)
+        if isinstance(node, ast.filter.Has):
+            return self._has(type_, node, head)
+        if isinstance(node, ast.filter.Any):
+            return self._any(type_, node, head)
+        if isinstance(node, ast.filter.All):
+            return self._all(type_, node, head)
+        if isinstance(node, ast.filter.And):
+            return self._and(type_, node, head)
+        if isinstance(node, ast.filter.Or):
+            return self._or(type_, node, head)
+        if isinstance(node, ast.filter.Equals):
+            return self._equals(type_, node, head)
+        if isinstance(node, ast.filter.Substring):
+            return self._substring(type_, node, head)
+        if isinstance(node, ast.filter.StartsWith):
+            return self._starts_with(type_, node, head)
+        if isinstance(node, ast.filter.EndsWith):
+            return self._ends_with(type_, node, head)
+        if isinstance(node, ast.filter.LessThan):
+            return self._less_than(type_, node, head)
+        if isinstance(node, ast.filter.GreaterThan):
+            return self._greater_than(type_, node, head)
+        # invalid node
+        raise errors.BackendError(f'expected filter expression, found {node}')
+
+    def _parse_predicate_expression(
+            self,
+            type_: T_VERTEX,
+            node: ast.filter.PredicateExpression
+            ) -> typing.Tuple[str, T_VERTEX]:
+        """Route *node* to the handler of the respective PredicateExpression subclass."""
+        if isinstance(node, ast.filter.Predicate):
+            return self._predicate(type_, node)
+        if isinstance(node, ast.filter.OneOf):
+            return self._one_of(type_, node)
+        # invalid node
+        raise errors.BackendError(f'expected predicate expression, found {node}')
+
+    def _one_of(self, node_type: T_VERTEX, node: ast.filter.OneOf) -> typing.Tuple[str, T_VERTEX]:
+        """
+        """
+        if not isinstance(node_type, bsc.Node):
+            raise errors.BackendError(f'expected Node, found {node_type}')
+        # walk through predicates
+        suburi, rng = set(), None
+        for pred in node: # OneOf guarantees at least one expression
+            puri, subrng = self._parse_predicate_expression(node_type, pred)
+            # track predicate uris
+            suburi.add(puri)
+            try:
+                # check for more generic range
+                if rng is None or subrng > rng:
+                    rng = subrng
+                # check range consistency
+                if not subrng <= rng and not subrng >= rng:
+                    raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related')
+            except TypeError as err: # subrng and rng are not comparable
+                raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related') from err
+        if rng is None:
+            # for mypy to be certain of the rng type
+            # if rng were None, we'd have gotten a TypeError above (None > None)
+            raise errors.UnreachableError()
+        # return joint predicate expression and next range
+        return '|'.join(suburi), rng
+
+    def _predicate(self, node_type: T_VERTEX, node: ast.filter.Predicate) -> typing.Tuple[str, T_VERTEX]:
+        """
+        """
+        # check node_type
+        if not isinstance(node_type, bsc.Node):
+            raise errors.BackendError(f'expected Node, found {node_type}')
+        # fetch predicate and its uri
+        puri = node.predicate
+        # get and check predicate, domain, and range
+        if not self.schema.has_predicate(puri):
+            raise errors.ConsistencyError(f'predicate {puri} is not in the schema')
+        pred = self.schema.predicate(puri)
+        if pred.range is None:
+            # FIXME: It is a design error that Predicates can have a None range...
+            raise errors.BackendError(f'predicate {pred} has no range')
+        dom, rng = pred.domain, pred.range
+        # encapsulate predicate uri
+        puri = f'<{puri}>' # type: ignore [assignment] # variable re-use confuses mypy
+        # apply reverse flag
+        if node.reverse:
+            puri = URI('^' + puri)
+            dom, rng = rng, dom # type: ignore [assignment] # variable re-use confuses mypy
+        # check path consistency
+        if not node_type <= dom:
+            raise errors.ConsistencyError(f'expected type {dom} or subtype thereof, found {node_type}')
+        # return predicate URI and next node type
+        return puri, rng
+
+    def _any(self, node_type: T_VERTEX, node: ast.filter.Any, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Node):
+            raise errors.BackendError(f'expected Node, found {node_type}')
+        # parse predicate
+        pred, next_type = self._parse_predicate_expression(node_type, node.predicate)
+        # parse expression
+        nexthead = next(self.ngen)
+        expr = self._parse_filter_expression(next_type, node.expr, nexthead)
+        # combine results
+        return f'{head} {pred} {nexthead} . {expr}'
+
+    def _all(self, node_type: T_VERTEX, node: ast.filter.All, head: str) -> str:
+        """
+        """
+        # NOTE: All(P, E) := Not(Any(P, Not(E))) and EXISTS(P, ?)
+        if not isinstance(node_type, bsc.Node):
+            raise errors.BackendError(f'expected Node, found {node_type}')
+        # parse rewritten ast
+        expr = self._parse_filter_expression(node_type,
+            ast.filter.Not(
+                ast.filter.Any(node.predicate,
+                    ast.filter.Not(node.expr))), head)
+        # parse predicate for existence constraint
+        pred, _ = self._parse_predicate_expression(node_type, node.predicate)
+        temphead = next(self.ngen)
+        # return existence and rewritten expression
+        return f'FILTER EXISTS {{ {head} {pred} {temphead} }} . ' + expr
+
+    def _and(self, node_type: T_VERTEX, node: ast.filter.And, head: str) -> str:
+        """
+        """
+        sub = [self._parse_filter_expression(node_type, expr, head) for expr in node]
+        return ' . '.join(sub)
+
+    def _or(self, node_type: T_VERTEX, node: ast.filter.Or, head: str) -> str:
+        """
+        """
+        # potential special case optimization:
+        # * ast: Or(Equals('foo'), Equals('bar'), ...)
+        # * query: VALUES ?head { "value1"^^<...> "value2"^^<...> "value3"^<...> ... }
+        sub = [self._parse_filter_expression(node_type, expr, head) for expr in node]
+        sub = ['{' + expr + '}' for expr in sub]
+        return ' UNION '.join(sub)
+
+    def _not(self, node_type: T_VERTEX, node: ast.filter.Not, head: str) -> str:
+        """
+        """
+        expr = self._parse_filter_expression(node_type, node.expr, head)
+        if isinstance(node_type, bsc.Literal):
+            return f'MINUS {{ {expr} }}'
+        # NOTE: for bsc.Node types, we must include at least one expression in the body of MINUS,
+        # otherwise the connection between the context and body of MINUS is lost.
+        # The simplest (and non-interfering) choice is a type statement.
+        return f'MINUS {{ {head} <{ns.rdf.type}>/<{ns.rdfs.subClassOf}>* <{node_type.uri}> . {expr} }}'
+
+    def _has(self, node_type: T_VERTEX, node: ast.filter.Has, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Node):
+            raise errors.BackendError(f'expected Node, found {node_type}')
+        # parse predicate
+        pred, _ = self._parse_predicate_expression(node_type, node.predicate)
+        # get new heads
+        inner = next(self.ngen)
+        outer = next(self.ngen)
+        # predicate count expression (fetch number of predicates at *head*)
+        num_preds = f'{{ SELECT (COUNT(distinct {inner}) as {outer}) WHERE {{ {head} {pred} {inner} }} }}'
+        # count expression
+        # FIXME: We have to ensure that ns.xsd.integer is always known in the schema!
+        count_bounds = self._parse_filter_expression(self.schema.literal(ns.xsd.integer), node.count, outer)
+        # combine
+        return num_preds + ' . ' + count_bounds
+
+    def _is(self, node_type: T_VERTEX, node: ast.filter.Is, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Node):
+            raise errors.BackendError(f'expected Node, found {node_type}')
+        return f'VALUES {head} {{ <{node.value}> }}'
+
+    def _equals(self, node_type: T_VERTEX, node: ast.filter.Equals, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Literal):
+            raise errors.BackendError(f'expected Literal, found {node}')
+        return f'VALUES {head} {{ "{node.value}"^^<{node_type.uri}> }}'
+
+    def _substring(self, node_type: T_VERTEX, node: ast.filter.Substring, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Literal):
+            raise errors.BackendError(f'expected Literal, found {node_type}')
+        return f'FILTER contains(str({head}), "{node.value}")'
+
+    def _starts_with(self, node_type: T_VERTEX, node: ast.filter.StartsWith, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Literal):
+            raise errors.BackendError(f'expected Literal, found {node_type}')
+        return f'FILTER strstarts(str({head}), "{node.value}")'
+
+    def _ends_with(self, node_type: T_VERTEX, node: ast.filter.EndsWith, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Literal):
+            raise errors.BackendError(f'expected Literal, found {node_type}')
+        return f'FILTER strends(str({head}), "{node.value}")'
+
+    def _less_than(self, node_type: T_VERTEX, node: ast.filter.LessThan, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Literal):
+            raise errors.BackendError(f'expected Literal, found {node_type}')
+        equality = '=' if not node.strict else ''
+        return f'FILTER ({head} <{equality} {float(node.threshold)})'
+
+    def _greater_than(self, node_type: T_VERTEX, node: ast.filter.GreaterThan, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Literal):
+            raise errors.BackendError(f'expected Literal, found {node_type}')
+        equality = '=' if not node.strict else ''
+        return f'FILTER ({head} >{equality} {float(node.threshold)})'
+
+## EOF ##
diff --git a/bsfs/triple_store/sparql/sparql.py b/bsfs/triple_store/sparql/sparql.py
index 7172f34..c3cbff6 100644
--- a/bsfs/triple_store/sparql/sparql.py
+++ b/bsfs/triple_store/sparql/sparql.py
@@ -15,6 +15,7 @@ from bsfs.query import ast
 from bsfs.utils import errors, URI
 
 # inner-module imports
+from . import parse_filter
 from .. import base
 
 
@@ -86,11 +87,15 @@ class SparqlStore(base.TripleStoreBase):
     # The local schema.
     _schema: bsc.Schema
 
+    # Filter parser
+    _filter_parser: parse_filter.Filter
+
     def __init__(self):
         super().__init__(None)
         self._graph = rdflib.Graph()
         self._transaction = _Transaction(self._graph)
         self._schema = bsc.Schema.Empty()
+        self._filter_parser = parse_filter.Filter(self._schema)
 
     # NOTE: mypy and pylint complain about the **kwargs not being listed (contrasting super)
     # However, not having it here is clearer since it's explicit that there are no arguments.
@@ -127,10 +132,17 @@ class SparqlStore(base.TripleStoreBase):
         # get deleted classes
         sub = self.schema - schema
 
-        # remove predicate instances
         for pred in sub.predicates:
+            # remove predicate instances
             for src, trg in self._graph.subject_objects(rdflib.URIRef(pred.uri)):
                 self._transaction.remove((src, rdflib.URIRef(pred.uri), trg))
+            # remove predicate definition
+            if pred.parent is not None:
+                self._transaction.remove((
+                    rdflib.URIRef(pred.uri),
+                    rdflib.RDFS.subClassOf,
+                    rdflib.URIRef(pred.parent.uri),
+                    ))
 
         # remove node instances
         for node in sub.nodes:
@@ -144,17 +156,46 @@ class SparqlStore(base.TripleStoreBase):
                     self._transaction.remove((inst, pred, trg))
                 # remove instance
                 self._transaction.remove((inst, rdflib.RDF.type, rdflib.URIRef(node.uri)))
-
-        # NOTE: Nothing to do for literals
+            # remove node definition
+            if node.parent is not None:
+                self._transaction.remove((
+                    rdflib.URIRef(node.uri),
+                    rdflib.RDFS.subClassOf,
+                    rdflib.URIRef(node.parent.uri),
+                    ))
+
+        for lit in sub.literals:
+            # remove literal definition
+            if lit.parent is not None:
+                self._transaction.remove((
+                    rdflib.URIRef(lit.uri),
+                    rdflib.RDFS.subClassOf,
+                    rdflib.URIRef(lit.parent.uri),
+                    ))
+
+        # add predicate, node, and literal hierarchies to the graph
+        for itm in itertools.chain(schema.predicates(), schema.nodes(), schema.literals()):
+            if itm.parent is not None:
+                self._transaction.add((rdflib.URIRef(itm.uri), rdflib.RDFS.subClassOf, rdflib.URIRef(itm.parent.uri)))
 
         # commit instance changes
         self.commit()
 
         # migrate schema
         self._schema = schema
+        self._filter_parser.schema = schema
 
-    def get(self, node_type: bsc.Node, query: ast.filter.FilterExpression) -> typing.Iterator[URI]:
-        raise NotImplementedError()
+    def get(
+            self,
+            node_type: bsc.Node,
+            query: typing.Optional[ast.filter.FilterExpression] = None,
+            ) -> typing.Iterator[URI]:
+        if node_type not in self.schema.nodes():
+            raise errors.ConsistencyError(f'{node_type} is not defined in the schema')
+        if not isinstance(query, ast.filter.FilterExpression):
+            raise TypeError(query)
+        for guid, in self._graph.query(self._filter_parser(node_type, query)):
+            yield URI(guid)
 
     def _has_type(self, subject: URI, node_type: bsc.Node) -> bool:
         """Return True if *subject* is a node of class *node_type* or a subclass thereof."""