From 791918039979d0743fd2ea4b9a5e74593ff96fd0 Mon Sep 17 00:00:00 2001
From: Matthias Baumgartner <dev@igsor.net>
Date: Mon, 19 Dec 2022 13:32:34 +0100
Subject: query ast file structures and essential interfaces

---
 bsfs/graph/graph.py                  |   5 +
 bsfs/query/__init__.py               |  20 +++
 bsfs/query/ast/__init__.py           |  24 ++++
 bsfs/query/ast/filter_.py            |  30 ++++
 bsfs/query/validator.py              |  35 +++++
 bsfs/triple_store/base.py            |   8 ++
 bsfs/triple_store/sparql.py          | 253 ----------------------------------
 bsfs/triple_store/sparql/__init__.py |  18 +++
 bsfs/triple_store/sparql/sparql.py   | 256 +++++++++++++++++++++++++++++++++++
 9 files changed, 396 insertions(+), 253 deletions(-)
 create mode 100644 bsfs/query/__init__.py
 create mode 100644 bsfs/query/ast/__init__.py
 create mode 100644 bsfs/query/ast/filter_.py
 create mode 100644 bsfs/query/validator.py
 delete mode 100644 bsfs/triple_store/sparql.py
 create mode 100644 bsfs/triple_store/sparql/__init__.py
 create mode 100644 bsfs/triple_store/sparql/sparql.py

(limited to 'bsfs')

diff --git a/bsfs/graph/graph.py b/bsfs/graph/graph.py
index b7b9f1c..10e5904 100644
--- a/bsfs/graph/graph.py
+++ b/bsfs/graph/graph.py
@@ -9,6 +9,7 @@ import os
 import typing
 
 # bsfs imports
+from bsfs.query import ast
 from bsfs.schema import Schema
 from bsfs.triple_store import TripleStoreBase
 from bsfs.utils import URI, typename
@@ -110,4 +111,8 @@ class Graph():
         type_ = self.schema.node(node_type)
         return _nodes.Nodes(self._backend, self._user, type_, {guid})
 
+    def get(self, node_type: URI, subject: ast.filter.FilterExpression) -> Nodes:
+        """Return a `Nodes` instance over all nodes of type *node_type* that match the *subject* query."""
+        raise NotImplementedError()
+
 ## EOF ##
diff --git a/bsfs/query/__init__.py b/bsfs/query/__init__.py
new file mode 100644
index 0000000..21c7389
--- /dev/null
+++ b/bsfs/query/__init__.py
@@ -0,0 +1,20 @@
+"""
+
+Part of the BlackStar filesystem (bsfs) module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from . import ast
+from . import validator as validate
+
+# exports
+__all__: typing.Sequence[str] = (
+    'ast',
+    'validate',
+    )
+
+## EOF ##
diff --git a/bsfs/query/ast/__init__.py b/bsfs/query/ast/__init__.py
new file mode 100644
index 0000000..0ee7385
--- /dev/null
+++ b/bsfs/query/ast/__init__.py
@@ -0,0 +1,24 @@
+"""Query AST components.
+
+The query AST consists of a Filter syntax tree.
+
+Classes beginning with an underscore (_) represent internal type hierarchies
+and should not be used for parsing. Note that the AST structures do not
+(and cannot) check semantic validity or consistency with a given schema.
+
+Part of the BlackStar filesystem (bsfs) module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from . import filter_ as filter
+
+# exports
+__all__: typing.Sequence[str] = (
+    'filter',
+    )
+
+## EOF ##
diff --git a/bsfs/query/ast/filter_.py b/bsfs/query/ast/filter_.py
new file mode 100644
index 0000000..4086fc1
--- /dev/null
+++ b/bsfs/query/ast/filter_.py
@@ -0,0 +1,30 @@
+"""Filter AST.
+
+Part of the BlackStar filesystem (bsfs) module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+from collections import abc
+import typing
+
+# exports
+__all__ : typing.Sequence[str] = []
+
+
+## code ##
+
+class _Expression(abc.Hashable): # carries no state: hash and equality depend on the type only
+    def __repr__(self) -> str:
+        """Return the expression's string representation: the class name followed by ``()``."""
+        return f'{type(self).__name__}()' # no helper needed; this module imports only abc and typing
+
+    def __hash__(self) -> int:
+        """Return the expression's integer representation, derived from its type."""
+        return hash(type(self))
+
+    def __eq__(self, other: typing.Any) -> bool:
+        """Return True if *other* is an instance of the type of *self* (subclasses included)."""
+        return isinstance(other, type(self))
+
+## EOF ##
diff --git a/bsfs/query/validator.py b/bsfs/query/validator.py
new file mode 100644
index 0000000..ac3789a
--- /dev/null
+++ b/bsfs/query/validator.py
@@ -0,0 +1,35 @@
+"""
+
+Part of the BlackStar filesystem (bsfs) module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsfs imports
+from bsfs import schema as bsc
+
+# inner-module imports
+from . import ast
+
+# exports
+__all__ : typing.Sequence[str] = (
+    'Filter',
+    )
+
+
+## code ##
+
+class Filter():
+
+    # schema to validate against.
+    schema: bsc.Schema
+
+    def __init__(self, schema: bsc.Schema):
+        self.schema = schema
+
+    def parse(self, node: ast.filter.FilterExpression):
+        raise NotImplementedError()
+
+## EOF ##
diff --git a/bsfs/triple_store/base.py b/bsfs/triple_store/base.py
index 6561262..28ebb86 100644
--- a/bsfs/triple_store/base.py
+++ b/bsfs/triple_store/base.py
@@ -108,6 +108,14 @@ class TripleStoreBase(abc.ABC):
 
         """
 
+    @abc.abstractmethod
+    def get(
+            self,
+            node_type: bsc.Node,
+            query: ast.filter.FilterExpression,
+            ) -> typing.Iterator[URI]:
+        """Return guids of nodes of type *node_type* that match the *query*."""
+
     @abc.abstractmethod
     def exists(
             self,
diff --git a/bsfs/triple_store/sparql.py b/bsfs/triple_store/sparql.py
deleted file mode 100644
index 7516dff..0000000
--- a/bsfs/triple_store/sparql.py
+++ /dev/null
@@ -1,253 +0,0 @@
-"""
-
-Part of the BlackStar filesystem (bsfs) module.
-A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2022
-"""
-# imports
-import itertools
-import typing
-import rdflib
-
-# bsfs imports
-from bsfs import schema as bsc
-from bsfs.utils import errors, URI
-
-# inner-module imports
-from . import base
-
-
-# exports
-__all__: typing.Sequence[str] = (
-    'SparqlStore',
-    )
-
-
-## code ##
-
-class _Transaction():
-    """Lightweight rdflib transactions for in-memory databases."""
-
-    # graph instance.
-    _graph: rdflib.Graph
-
-    # current log of added triples.
-    _added: typing.List[typing.Any]
-
-    # current log of removed triples.
-    _removed: typing.List[typing.Any]
-
-    def __init__(self, graph: rdflib.Graph):
-        self._graph = graph
-        # initialize internal structures
-        self.commit()
-
-    def commit(self):
-        """Commit temporary changes."""
-        self._added = []
-        self._removed = []
-
-    def rollback(self):
-        """Undo changes since the last commit."""
-        for triple in self._added:
-            self._graph.remove(triple)
-        for triple in self._removed:
-            self._graph.add(triple)
-
-    def add(self, triple: typing.Any):
-        """Add a triple to the graph."""
-        if triple not in self._graph:
-            self._added.append(triple)
-            self._graph.add(triple)
-
-    def remove(self, triple: typing.Any):
-        """Remove a triple from the graph."""
-        if triple in self._graph:
-            self._removed.append(triple)
-            self._graph.remove(triple)
-
-
-class SparqlStore(base.TripleStoreBase):
-    """Sparql-based triple store.
-
-    The sparql triple store uses a third-party backend
-    (currently rdflib) to store triples and manages them via
-    the Sparql query language.
-
-    """
-
-    # The rdflib graph.
-    _graph: rdflib.Graph
-
-    # Current transaction.
-    _transaction: _Transaction
-
-    # The local schema.
-    _schema: bsc.Schema
-
-    def __init__(self):
-        super().__init__(None)
-        self._graph = rdflib.Graph()
-        self._transaction = _Transaction(self._graph)
-        self._schema = bsc.Schema.Empty()
-
-    # NOTE: mypy and pylint complain about the **kwargs not being listed (contrasting super)
-    # However, not having it here is clearer since it's explicit that there are no arguments.
-    @classmethod
-    def Open(cls) -> 'SparqlStore': # type: ignore [override] # pylint: disable=arguments-differ
-        return cls()
-
-    def commit(self):
-        self._transaction.commit()
-
-    def rollback(self):
-        self._transaction.rollback()
-
-    @property
-    def schema(self) -> bsc.Schema:
-        return self._schema
-
-    @schema.setter
-    def schema(self, schema: bsc.Schema):
-        # check args: Schema instanace
-        if not isinstance(schema, bsc.Schema):
-            raise TypeError(schema)
-        # check compatibility: No contradicting definitions
-        if not self.schema.consistent_with(schema):
-            raise errors.ConsistencyError(f'{schema} is inconsistent with {self.schema}')
-
-        # commit the current transaction
-        self.commit()
-
-        # adjust instances:
-        # nothing to do for added classes
-        # delete instances of removed classes
-
-        # get deleted classes
-        sub = self.schema - schema
-
-        # remove predicate instances
-        for pred in sub.predicates:
-            for src, trg in self._graph.subject_objects(rdflib.URIRef(pred.uri)):
-                self._transaction.remove((src, rdflib.URIRef(pred.uri), trg))
-
-        # remove node instances
-        for node in sub.nodes:
-            # iterate through node instances
-            for inst in self._graph.subjects(rdflib.RDF.type, rdflib.URIRef(node.uri)):
-                # remove triples where the instance is in the object position
-                for src, pred in self._graph.subject_predicates(inst):
-                    self._transaction.remove((src, pred, inst))
-                # remove triples where the instance is in the subject position
-                for pred, trg in self._graph.predicate_objects(inst):
-                    self._transaction.remove((inst, pred, trg))
-                # remove instance
-                self._transaction.remove((inst, rdflib.RDF.type, rdflib.URIRef(node.uri)))
-
-        # NOTE: Nothing to do for literals
-
-        # commit instance changes
-        self.commit()
-
-        # migrate schema
-        self._schema = schema
-
-
-    def _has_type(self, subject: URI, node_type: bsc.Node) -> bool:
-        """Return True if *subject* is a node of class *node_type* or a subclass thereof."""
-        if node_type not in self.schema.nodes():
-            raise errors.ConsistencyError(f'{node_type} is not defined in the schema')
-
-        subject_types = list(self._graph.objects(rdflib.URIRef(subject), rdflib.RDF.type))
-        if len(subject_types) == 0:
-            return False
-        if len(subject_types) == 1:
-            node = self.schema.node(URI(subject_types[0])) # type: ignore [arg-type] # URI is a subtype of str
-            if node == node_type:
-                return True
-            if node_type in node.parents():
-                return True
-            return False
-        raise errors.UnreachableError()
-
-    def exists(
-            self,
-            node_type: bsc.Node,
-            guids: typing.Iterable[URI],
-            ) -> typing.Iterable[URI]:
-        return (subj for subj in guids if self._has_type(subj, node_type))
-
-    def create(
-            self,
-            node_type: bsc.Node,
-            guids: typing.Iterable[URI],
-            ):
-        # check node_type
-        if node_type not in self.schema.nodes():
-            raise errors.ConsistencyError(f'{node_type} is not defined in the schema')
-        # check and create guids
-        for guid in guids:
-            subject = rdflib.URIRef(guid)
-            # check node existence
-            if (subject, rdflib.RDF.type, None) in self._graph:
-                # FIXME: node exists and may have a different type! ignore? raise? report?
-                continue
-            # add node
-            self._transaction.add((subject, rdflib.RDF.type, rdflib.URIRef(node_type.uri)))
-
-    def set(
-            self,
-            node_type: bsc.Node,
-            guids: typing.Iterable[URI],
-            predicate: bsc.Predicate,
-            values: typing.Iterable[typing.Any],
-            ):
-        # check node_type
-        if node_type not in self.schema.nodes():
-            raise errors.ConsistencyError(f'{node_type} is not defined in the schema')
-        # check predicate
-        if predicate not in self.schema.predicates():
-            raise errors.ConsistencyError(f'{predicate} is not defined in the schema')
-        if not node_type <= predicate.domain:
-            raise errors.ConsistencyError(f'{node_type} must be a subclass of {predicate.domain}')
-        # NOTE: predicate.range is in the schema since predicate is in the schema.
-        # materialize values
-        values = set(values)
-        # check values
-        if len(values) == 0:
-            return
-        if predicate.unique and len(values) != 1:
-            raise ValueError(values)
-        if isinstance(predicate.range, bsc.Node):
-            values = set(values) # materialize to safeguard against iterators passed as argument
-            inconsistent = {val for val in values if not self._has_type(val, predicate.range)}
-            # catches nodes that don't exist and nodes that have an inconsistent type
-            if len(inconsistent) > 0:
-                raise errors.InstanceError(inconsistent)
-        # check guids
-        # FIXME: Fail or skip inexistent nodes?
-        guids = set(guids)
-        inconsistent = {guid for guid in guids if not self._has_type(guid, node_type)}
-        if len(inconsistent) > 0:
-            raise errors.InstanceError(inconsistent)
-
-        # add triples
-        pred = rdflib.URIRef(predicate.uri)
-        for guid, value in itertools.product(guids, values):
-            guid = rdflib.URIRef(guid)
-            # convert value
-            if isinstance(predicate.range, bsc.Literal):
-                value = rdflib.Literal(value, datatype=rdflib.URIRef(predicate.range.uri))
-            elif isinstance(predicate.range, bsc.Node):
-                value = rdflib.URIRef(value)
-            else:
-                raise errors.UnreachableError()
-            # clear triples for unique predicates
-            if predicate.unique:
-                for obj in self._graph.objects(guid, pred):
-                    if obj != value:
-                        self._transaction.remove((guid, pred, obj))
-            # add triple
-            self._transaction.add((guid, pred, value))
-
-## EOF ##
diff --git a/bsfs/triple_store/sparql/__init__.py b/bsfs/triple_store/sparql/__init__.py
new file mode 100644
index 0000000..285334a
--- /dev/null
+++ b/bsfs/triple_store/sparql/__init__.py
@@ -0,0 +1,18 @@
+"""
+
+Part of the BlackStar filesystem (bsfs) module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from .sparql import SparqlStore
+
+# exports
+__all__: typing.Sequence[str] = (
+    'SparqlStore',
+    )
+
+## EOF ##
diff --git a/bsfs/triple_store/sparql/sparql.py b/bsfs/triple_store/sparql/sparql.py
new file mode 100644
index 0000000..fff540a
--- /dev/null
+++ b/bsfs/triple_store/sparql/sparql.py
@@ -0,0 +1,256 @@
+"""
+
+Part of the BlackStar filesystem (bsfs) module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import itertools
+import typing
+import rdflib
+
+# bsfs imports
+from bsfs import schema as bsc
+from bsfs.query import ast
+from bsfs.utils import errors, URI
+
+# inner-module imports
+from . import base
+
+
+# exports
+__all__: typing.Sequence[str] = (
+    'SparqlStore',
+    )
+
+
+## code ##
+
+class _Transaction():
+    """Lightweight rdflib transactions for in-memory databases."""
+
+    # graph instance.
+    _graph: rdflib.Graph
+
+    # current log of added triples.
+    _added: typing.List[typing.Any]
+
+    # current log of removed triples.
+    _removed: typing.List[typing.Any]
+
+    def __init__(self, graph: rdflib.Graph):
+        self._graph = graph
+        # initialize internal structures
+        self.commit()
+
+    def commit(self):
+        """Commit temporary changes."""
+        self._added = []
+        self._removed = []
+
+    def rollback(self):
+        """Undo changes since the last commit."""
+        for triple in self._added:
+            self._graph.remove(triple)
+        for triple in self._removed:
+            self._graph.add(triple)
+
+    def add(self, triple: typing.Any):
+        """Add a triple to the graph."""
+        if triple not in self._graph:
+            self._added.append(triple)
+            self._graph.add(triple)
+
+    def remove(self, triple: typing.Any):
+        """Remove a triple from the graph."""
+        if triple in self._graph:
+            self._removed.append(triple)
+            self._graph.remove(triple)
+
+
+class SparqlStore(base.TripleStoreBase):
+    """Sparql-based triple store.
+
+    The sparql triple store uses a third-party backend
+    (currently rdflib) to store triples and manages them via
+    the Sparql query language.
+
+    """
+
+    # The rdflib graph.
+    _graph: rdflib.Graph
+
+    # Current transaction.
+    _transaction: _Transaction
+
+    # The local schema.
+    _schema: bsc.Schema
+
+    def __init__(self):
+        super().__init__(None)
+        self._graph = rdflib.Graph()
+        self._transaction = _Transaction(self._graph)
+        self._schema = bsc.Schema.Empty()
+
+    # NOTE: mypy and pylint complain about the **kwargs not being listed (contrasting super)
+    # However, not having it here is clearer since it's explicit that there are no arguments.
+    @classmethod
+    def Open(cls) -> 'SparqlStore': # type: ignore [override] # pylint: disable=arguments-differ
+        return cls()
+
+    def commit(self):
+        self._transaction.commit()
+
+    def rollback(self):
+        self._transaction.rollback()
+
+    @property
+    def schema(self) -> bsc.Schema:
+        return self._schema
+
+    @schema.setter
+    def schema(self, schema: bsc.Schema):
+        # check args: Schema instance
+        if not isinstance(schema, bsc.Schema):
+            raise TypeError(schema)
+        # check compatibility: No contradicting definitions
+        if not self.schema.consistent_with(schema):
+            raise errors.ConsistencyError(f'{schema} is inconsistent with {self.schema}')
+
+        # commit the current transaction
+        self.commit()
+
+        # adjust instances:
+        # nothing to do for added classes
+        # delete instances of removed classes
+
+        # get deleted classes
+        sub = self.schema - schema
+
+        # remove predicate instances
+        for pred in sub.predicates:
+            for src, trg in self._graph.subject_objects(rdflib.URIRef(pred.uri)):
+                self._transaction.remove((src, rdflib.URIRef(pred.uri), trg))
+
+        # remove node instances
+        for node in sub.nodes:
+            # iterate through node instances
+            for inst in self._graph.subjects(rdflib.RDF.type, rdflib.URIRef(node.uri)):
+                # remove triples where the instance is in the object position
+                for src, pred in self._graph.subject_predicates(inst):
+                    self._transaction.remove((src, pred, inst))
+                # remove triples where the instance is in the subject position
+                for pred, trg in self._graph.predicate_objects(inst):
+                    self._transaction.remove((inst, pred, trg))
+                # remove instance
+                self._transaction.remove((inst, rdflib.RDF.type, rdflib.URIRef(node.uri)))
+
+        # NOTE: Nothing to do for literals
+
+        # commit instance changes
+        self.commit()
+
+        # migrate schema
+        self._schema = schema
+
+    def get(self, node_type: bsc.Node, query: ast.filter.FilterExpression) -> typing.Iterator[URI]:
+        raise NotImplementedError()
+
+    def _has_type(self, subject: URI, node_type: bsc.Node) -> bool:
+        """Return True if *subject* is a node of class *node_type* or a subclass thereof."""
+        if node_type not in self.schema.nodes():
+            raise errors.ConsistencyError(f'{node_type} is not defined in the schema')
+
+        subject_types = list(self._graph.objects(rdflib.URIRef(subject), rdflib.RDF.type))
+        if len(subject_types) == 0:
+            return False
+        if len(subject_types) == 1:
+            node = self.schema.node(URI(subject_types[0])) # type: ignore [arg-type] # URI is a subtype of str
+            if node == node_type:
+                return True
+            if node_type in node.parents():
+                return True
+            return False
+        raise errors.UnreachableError()
+
+    def exists(
+            self,
+            node_type: bsc.Node,
+            guids: typing.Iterable[URI],
+            ) -> typing.Iterable[URI]:
+        return (subj for subj in guids if self._has_type(subj, node_type))
+
+    def create(
+            self,
+            node_type: bsc.Node,
+            guids: typing.Iterable[URI],
+            ):
+        # check node_type
+        if node_type not in self.schema.nodes():
+            raise errors.ConsistencyError(f'{node_type} is not defined in the schema')
+        # check and create guids
+        for guid in guids:
+            subject = rdflib.URIRef(guid)
+            # check node existence
+            if (subject, rdflib.RDF.type, None) in self._graph:
+                # FIXME: node exists and may have a different type! ignore? raise? report?
+                continue
+            # add node
+            self._transaction.add((subject, rdflib.RDF.type, rdflib.URIRef(node_type.uri)))
+
+    def set(
+            self,
+            node_type: bsc.Node,
+            guids: typing.Iterable[URI],
+            predicate: bsc.Predicate,
+            values: typing.Iterable[typing.Any],
+            ):
+        # check node_type
+        if node_type not in self.schema.nodes():
+            raise errors.ConsistencyError(f'{node_type} is not defined in the schema')
+        # check predicate
+        if predicate not in self.schema.predicates():
+            raise errors.ConsistencyError(f'{predicate} is not defined in the schema')
+        if not node_type <= predicate.domain:
+            raise errors.ConsistencyError(f'{node_type} must be a subclass of {predicate.domain}')
+        # NOTE: predicate.range is in the schema since predicate is in the schema.
+        # materialize values
+        values = set(values)
+        # check values
+        if len(values) == 0:
+            return
+        if predicate.unique and len(values) != 1:
+            raise ValueError(values)
+        if isinstance(predicate.range, bsc.Node):
+            values = set(values) # materialize to safeguard against iterators passed as argument
+            inconsistent = {val for val in values if not self._has_type(val, predicate.range)}
+            # catches nodes that don't exist and nodes that have an inconsistent type
+            if len(inconsistent) > 0:
+                raise errors.InstanceError(inconsistent)
+        # check guids
+        # FIXME: Fail or skip inexistent nodes?
+        guids = set(guids)
+        inconsistent = {guid for guid in guids if not self._has_type(guid, node_type)}
+        if len(inconsistent) > 0:
+            raise errors.InstanceError(inconsistent)
+
+        # add triples
+        pred = rdflib.URIRef(predicate.uri)
+        for guid, value in itertools.product(guids, values):
+            guid = rdflib.URIRef(guid)
+            # convert value
+            if isinstance(predicate.range, bsc.Literal):
+                value = rdflib.Literal(value, datatype=rdflib.URIRef(predicate.range.uri))
+            elif isinstance(predicate.range, bsc.Node):
+                value = rdflib.URIRef(value)
+            else:
+                raise errors.UnreachableError()
+            # clear triples for unique predicates
+            if predicate.unique:
+                for obj in self._graph.objects(guid, pred):
+                    if obj != value:
+                        self._transaction.remove((guid, pred, obj))
+            # add triple
+            self._transaction.add((guid, pred, value))
+
+## EOF ##
-- 
cgit v1.2.3


From a0f2308adcb226d28de3355bc7115a6d9b669462 Mon Sep 17 00:00:00 2001
From: Matthias Baumgartner <dev@igsor.net>
Date: Mon, 19 Dec 2022 13:40:02 +0100
Subject: import fixes

---
 bsfs/graph/graph.py                |   2 +-
 bsfs/query/validator.py            | 177 ++++++++++++++++++++++++++++++++++++-
 bsfs/triple_store/base.py          |   3 +-
 bsfs/triple_store/sparql/sparql.py |   2 +-
 4 files changed, 179 insertions(+), 5 deletions(-)

(limited to 'bsfs')

diff --git a/bsfs/graph/graph.py b/bsfs/graph/graph.py
index 10e5904..51fe75d 100644
--- a/bsfs/graph/graph.py
+++ b/bsfs/graph/graph.py
@@ -111,7 +111,7 @@ class Graph():
         type_ = self.schema.node(node_type)
         return _nodes.Nodes(self._backend, self._user, type_, {guid})
 
-    def get(self, node_type: URI, subject: ast.filter.FilterExpression) -> Nodes:
+    def get(self, node_type: URI, subject: ast.filter.FilterExpression) -> _nodes.Nodes:
         """Return a `Nodes` instance over all nodes of type *node_type* that match the *subject* query."""
         raise NotImplementedError()
 
diff --git a/bsfs/query/validator.py b/bsfs/query/validator.py
index ac3789a..123b947 100644
--- a/bsfs/query/validator.py
+++ b/bsfs/query/validator.py
@@ -29,7 +29,180 @@ class Filter():
     def __init__(self, schema: bsc.Schema):
         self.schema = schema
 
-    def parse(self, node: ast.filter.FilterExpression):
-        raise NotImplementedError()
+    def parse(self, node: ast.filter.FilterExpression, subject: bsc.types._Vertex):
+        # subject is a node type
+        if not isinstance(subject, bsc.Node):
+            raise errors.ConsistencyError(f'Expected a node, found {subject}')
+        # subject exists in the schema
+        if subject not in self.schema.nodes:
+            raise errors.ConsistencyError(f'Invalid node type {subject}')
+        # root expression is valid
+        self._parse(node, subject)
+        # all tests passed
+        return True
+
+
+    def _parse_numerical_expression(self, node: ast.filter.FilterExpression, subject: bsc.types._Vertex):
+        if isinstance(node, ast.filter.And):
+            return self._and(node, subject)
+        elif isinstance(node, ast.filter.Or):
+            return self._or(node, subject)
+        elif isinstance(node, ast.filter.LessThan):
+            return self._lessThan(node, subject)
+        elif isinstance(node, ast.filter.GreaterThan):
+            return self._greaterThan(node, subject)
+        elif isinstance(node, ast.filter.Equals):
+            return self._equals(node, subject, numerical=True)
+        else:
+            raise errors.ConsistencyError(f'Expected a numerical expression, found {node}')
+
+
+    def __branch(self, node: typing.Union[ast.filter.Any, ast.filter.All], subject: bsc.types._Vertex):
+        # subject is a node type
+        if not isinstance(subject, bsc.Node):
+            raise errors.ConsistencyError(f'Expected a node, found {subject}')
+        # subject exists in the schema
+        if subject not in self.schema.nodes:
+            raise errors.ConsistencyError(f'Invalid node type {subject}')
+        # predicate is valid
+        dom, rng = self._parse_predicate_expression(node.predicate)
+        # subject is a subtype of the predicate's domain
+        if not subject <= dom:
+            raise errors.ConsistencyError(f'Expected type {dom}, found {subject}')
+        # child expression is valid
+        self._parse_filter_expression(node.expr, rng)
+
+    def _any(self, node: ast.filter.Any, subject: bsc.types._Vertex):
+        return self.__branch(node, subject)
+
+    def _all(self, node: ast.filter.All, subject: bsc.types._Vertex):
+        return self.__branch(node, subject)
+
+
+    def __agg(self, node: typing.Union[ast.filter.And, ast.filter.Or], subject: bsc.types._Vertex):
+        for expr in node:
+            # child expression is valid
+            self._parse_filter_expression(expr, subject)
+
+    def _and(self, node: ast.filter.And, subject: bsc.types._Vertex):
+        return self.__agg(node, subject)
+
+    def _or(self, node: ast.filter.Or, subject: bsc.types._Vertex):
+        return self.__agg(node, subject)
+
+
+    def _not(self, node: ast.filter.Not, subject: bsc.types._Vertex):
+        # child expression is valid
+        self._parse_filter_expression(node.expr, subject)
+
+
+    def _has(self, node: ast.filter.Has, subject: bsc.types._Vertex):
+        # subject is a node type
+        if not isinstance(subject, bsc.Node):
+            raise errors.ConsistencyError(f'Expected a node, found {subject}')
+        # subject exists in the schema
+        if subject not in self.schema.nodes:
+            raise errors.ConsistencyError(f'Invalid node type {subject}')
+        # predicate is valid
+        dom, rng = self._parse_predicate_expression(node.predicate)
+        # subject is a subtype of the predicate's domain
+        if not subject <= dom:
+            raise errors.ConsistencyError(f'Expected type {dom}, found {subject}')
+        # node.count is a numerical expression
+        self._parse_numerical_expression(node.count, self.schema.literal(ns.xsd.numerical))
+
+
+    def _equals(self, node: ast.filter.Equals, subject: bsc.types._Vertex, numerical: bool = False):
+        # subject is a literal
+        #if not isinstance(subject, bsc.Literal):
+        #    raise errors.ConsistencyError(f'Expected a literal, found {subject}')
+        if isinstance(subject, bsc.Node):
+            # FIXME: How to handle this case?
+            # FIXME: How to check if a NodeType is acceptable?
+            # FIXME: Maybe use flags to control what is expected as node identifiers?
+            from bsfs.graph.nodes import Nodes # FIXME
+            if not isinstance(node.value, Nodes) and not isinstance(node.value, URI):
+                raise errors.ConsistencyError(f'Expected a Nodes or URI, found {node.value}')
+        elif isinstance(subject, bsc.Literal):
+            # literal exists in the schema
+            if subject not in self.schema.literals:
+                raise errors.ConsistencyError(f'Invalid literal type {subject}')
+        else:
+            # FIXME:
+            raise errors.ConsistencyError(f'Expected a literal, found {subject}')
+        # node.value is numeric (if requested)
+        if numerical and not isinstance(node.value, float) and not isinstance(node.value, int):
+            raise errors.ConsistencyError(f'Expected a numerical value (int or float), found {node.value}')
+        # NOTE: We cannot check if node.value agrees with the subject since we don't know
+        # all literal types, their hierarchy, and how the backend converts datatypes.
+
+
+    def _substring(self, node: ast.filter.Substring, subject: bsc.types._Vertex):
+        # subject is a literal
+        if not isinstance(subject, bsc.Literal):
+            raise errors.ConsistencyError(f'Expected a literal, found {subject}')
+        # literal exists in the schema
+        if subject not in self.schema.literals:
+            raise errors.ConsistencyError(f'Invalid literal type {subject}')
+        # node.value matches literal datatype
+        if not subject.is_a(ns.xsd.string):
+            raise errors.ConsistencyError(f'Expected a string literal, found {subject}')
+
+
+    def _lessThan(self, node: ast.filter.LessThan, subject: bsc.types._Vertex):
+        # subject is a literal
+        if not isinstance(subject, bsc.Literal):
+            raise errors.ConsistencyError(f'Expected a literal, found {subject}')
+        # literal exists in the schema
+        if subject not in self.schema.literals:
+            raise errors.ConsistencyError(f'Invalid literal type {subject}')
+        # subject is numerical
+        if not subject.is_a(ns.xsd.numerical):
+            raise errors.ConsistencyError(f'Expected a numerical literal, found {subject}')
+
+
+    def _greaterThan(self, node: ast.filter.GreaterThan, subject: bsc.types._Vertex):
+        # subject is a literal
+        if not isinstance(subject, bsc.Literal):
+            raise errors.ConsistencyError(f'Expected a literal, found {subject}')
+        # literal exists in the schema
+        if subject not in self.schema.literals:
+            raise errors.ConsistencyError(f'Invalid literal type {subject}')
+        # subject is numerical
+        if not subject.is_a(ns.xsd.numerical):
+            raise errors.ConsistencyError(f'Expected a numerical literal, found {subject}')
+
+
+    def _predicate(self, node: ast.filter.Predicate):
+        """Return the (domain, range) of *node*'s predicate, swapped if *node.reverse* is set."""
+        try:
+            # predicate exists in the schema
+            pred = self.schema.predicate(node.predicate)
+        except KeyError as err:
+            raise errors.ConsistencyError(f'predicate {node.predicate} is not in the schema') from err
+        if node.reverse:
+            return pred.range, pred.domain
+        return pred.domain, pred.range
+
+
+    def _oneOf(self, node: ast.filter.OneOf):
+        """Return the most specific domain and most generic range over *node*'s predicates."""
+        dom, rng = None, None
+        for pred in node:
+            try:
+                subdom, subrng = self._parse_predicate_expression(pred)
+                # children must have related domains/ranges (nothing to compare while dom/rng are unset)
+                if dom is not None and not subdom <= dom and not subdom >= dom:
+                    raise errors.ConsistencyError(f'domains {subdom} and {dom} are not related')
+                if rng is not None and not subrng <= rng and not subrng >= rng:
+                    raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related')
+                # determine overall domain and range
+                if dom is None or subdom < dom: # pick most specific domain
+                    dom = subdom
+                if rng is None or subrng > rng: # pick most generic range
+                    rng = subrng
+            except KeyError as err:
+                raise errors.ConsistencyError(f'invalid predicate expression {pred}') from err
+        return dom, rng
 
 ## EOF ##
diff --git a/bsfs/triple_store/base.py b/bsfs/triple_store/base.py
index 28ebb86..5ff9523 100644
--- a/bsfs/triple_store/base.py
+++ b/bsfs/triple_store/base.py
@@ -9,6 +9,7 @@ import abc
 import typing
 
 # inner-module imports
+from bsfs.query import ast
 from bsfs.utils import URI, typename
 import bsfs.schema  as _schema
 
@@ -111,7 +112,7 @@ class TripleStoreBase(abc.ABC):
     @abc.abstractmethod
     def get(
             self,
-            node_type: bsc.Node,
+            node_type: _schema.Node,
             query: ast.filter.FilterExpression,
             ) -> typing.Iterator[URI]:
         """Return guids of nodes of type *node_type* that match the *query*."""
diff --git a/bsfs/triple_store/sparql/sparql.py b/bsfs/triple_store/sparql/sparql.py
index fff540a..7172f34 100644
--- a/bsfs/triple_store/sparql/sparql.py
+++ b/bsfs/triple_store/sparql/sparql.py
@@ -15,7 +15,7 @@ from bsfs.query import ast
 from bsfs.utils import errors, URI
 
 # inner-module imports
-from . import base
+from .. import base
 
 
 # exports
-- 
cgit v1.2.3


From 383fa8fd5c2e4b67089b4c5b654ebade51382f2c Mon Sep 17 00:00:00 2001
From: Matthias Baumgartner <dev@igsor.net>
Date: Thu, 22 Dec 2022 20:27:49 +0100
Subject: filter ast definition and validation

---
 bsfs/query/ast/__init__.py |   2 +-
 bsfs/query/ast/filter_.py  | 405 ++++++++++++++++++++++++++++++++++++++++++++-
 bsfs/query/validator.py    | 336 +++++++++++++++++++------------------
 bsfs/utils/__init__.py     |   3 +-
 bsfs/utils/commons.py      |  34 ++++
 bsfs/utils/errors.py       |   3 +
 6 files changed, 620 insertions(+), 163 deletions(-)

(limited to 'bsfs')

diff --git a/bsfs/query/ast/__init__.py b/bsfs/query/ast/__init__.py
index 0ee7385..704d051 100644
--- a/bsfs/query/ast/__init__.py
+++ b/bsfs/query/ast/__init__.py
@@ -14,7 +14,7 @@ Author: Matthias Baumgartner, 2022
 import typing
 
 # inner-module imports
-from . import filter_ as filter
+from . import filter_ as filter # pylint: disable=redefined-builtin
 
 # exports
 __all__: typing.Sequence[str] = (
diff --git a/bsfs/query/ast/filter_.py b/bsfs/query/ast/filter_.py
index 4086fc1..b129ded 100644
--- a/bsfs/query/ast/filter_.py
+++ b/bsfs/query/ast/filter_.py
@@ -1,5 +1,27 @@
 """Filter AST.
 
+Note that it is easily possible to construct an AST that is inconsistent with
+a given schema. Furthermore, it is possible to construct a semantically invalid
+AST that cannot be parsed correctly or includes contradicting statements.
+The AST nodes do not (and cannot) check such issues.
+
+For example, consider the following AST:
+
+>>> Any(ns.bse.collection,
+...     And(
+...         Equals('hello'),
+...         Any(ns.bsm.guid, Any(ns.bsm.guid, Equals('hello'))),
+...         Any(ns.bst.label, Equals('world')),
+...         All(ns.bst.label, Not(Equals('world'))),
+...     )
+... )
+
+This AST has multiple issues that are not verified upon its creation:
+* A condition on a non-literal.
+* A Filter on a literal.
+* Conditions exclude each other
+* The predicates along the branch have incompatible domains and ranges.
+
 Part of the BlackStar filesystem (bsfs) module.
 A copy of the license is provided with the project.
 Author: Matthias Baumgartner, 2022
@@ -8,12 +30,45 @@ Author: Matthias Baumgartner, 2022
 from collections import abc
 import typing
 
+# bsfs imports
+from bsfs.utils import URI, typename, normalize_args
+
+# inner-module imports
+#from . import utils
+
 # exports
-__all__ : typing.Sequence[str] = []
+__all__ : typing.Sequence[str] = (
+    # base classes
+    'FilterExpression',
+    'PredicateExpression',
+    # predicate expressions
+    'OneOf',
+    'Predicate',
+    # branching
+    'All',
+    'Any',
+    # aggregators
+    'And',
+    'Or',
+    # value matchers
+    'Equals',
+    'Substring',
+    'EndsWith',
+    'StartsWith',
+    # range matchers
+    'GreaterThan',
+    'LessThan',
+    # misc
+    'Has',
+    'Is',
+    'Not',
+    )
 
 
 ## code ##
 
+# pylint: disable=too-few-public-methods # Many expressions use mostly magic methods
+
 class _Expression(abc.Hashable):
     def __repr__(self) -> str:
         """Return the expressions's string representation."""
@@ -27,4 +82,352 @@ class _Expression(abc.Hashable):
         """Return True if *self* and *other* are equivalent."""
         return isinstance(other, type(self))
 
+
+class FilterExpression(_Expression):
+    """Generic Filter expression."""
+
+
+class PredicateExpression(_Expression):
+    """Generic Predicate expression."""
+
+
+class _Branch(FilterExpression):
+    """Branch the filter along a predicate."""
+
+    # predicate to follow.
+    predicate: PredicateExpression
+
+    # child expression to evaluate.
+    expr: FilterExpression
+
+    def __init__(
+            self,
+            predicate: typing.Union[PredicateExpression, URI],
+            expr: FilterExpression,
+            ):
+        # process predicate argument
+        if isinstance(predicate, URI):
+            predicate = Predicate(predicate)
+        elif not isinstance(predicate, PredicateExpression):
+            raise TypeError(predicate)
+        # process expression argument
+        if not isinstance(expr, FilterExpression):
+            raise TypeError(expr)
+        # assign members
+        self.predicate = predicate
+        self.expr = expr
+
+    def __repr__(self) -> str:
+        return f'{typename(self)}({self.predicate}, {self.expr})'
+
+    def __hash__(self) -> int:
+        return hash((super().__hash__(), self.predicate, self.expr))
+
+    def __eq__(self, other) -> bool:
+        return super().__eq__(other) \
+           and self.predicate == other.predicate \
+           and self.expr == other.expr
+
+class Any(_Branch):
+    """Any (and at least one) triple matches."""
+
+
+class All(_Branch):
+    """All (and at least one) triples match."""
+
+
+class _Agg(FilterExpression, abc.Collection):
+    """Combine multiple expressions."""
+
+    # child expressions
+    expr: typing.Set[FilterExpression]
+
+    def __init__(
+            self,
+            *expr: typing.Union[FilterExpression,
+                                typing.Iterable[FilterExpression],
+                                typing.Iterator[FilterExpression]]
+            ):
+        # unfold arguments
+        unfolded = set(normalize_args(*expr))
+        # check type
+        if not all(isinstance(e, FilterExpression) for e in unfolded):
+            raise TypeError(expr)
+        # assign member
+        self.expr = unfolded
+
+    def __contains__(self, expr: typing.Any) -> bool:
+        """Return True if *expr* is among the child expressions."""
+        return expr in self.expr
+
+    def __iter__(self) -> typing.Iterator[FilterExpression]:
+        """Iterator over child expressions."""
+        return iter(self.expr)
+
+    def __len__(self) -> int:
+        """Number of child expressions."""
+        return len(self.expr)
+
+    def __repr__(self) -> str:
+        return f'{typename(self)}({self.expr})'
+
+    def __hash__(self) -> int:
+        return hash((super().__hash__(), tuple(self.expr))) # FIXME: Unique hash of different orders over self.expr
+
+    def __eq__(self, other) -> bool:
+        return super().__eq__(other) and self.expr == other.expr
+
+
+class And(_Agg):
+    """All conditions match."""
+
+
+class Or(_Agg):
+    """At least one condition matches."""
+
+
+class Not(FilterExpression):
+    """Invert a statement."""
+
+    # child expression
+    expr: FilterExpression
+
+    def __init__(self, expr: FilterExpression):
+        # check argument
+        if not isinstance(expr, FilterExpression):
+            raise TypeError(expr)
+        # assign member
+        self.expr = expr
+
+    def __repr__(self) -> str:
+        return f'{typename(self)}({self.expr})'
+
+    def __hash__(self) -> int:
+        return hash((super().__hash__(), self.expr))
+
+    def __eq__(self, other: typing.Any) -> bool:
+        return super().__eq__(other) and self.expr == other.expr
+
+
+class Has(FilterExpression):
+    """Has predicate N times"""
+
+    # predicate to follow.
+    predicate: PredicateExpression
+
+    # target count
+    count: FilterExpression
+
+    def __init__(
+            self,
+            predicate: typing.Union[PredicateExpression, URI],
+            count: typing.Optional[typing.Union[FilterExpression, int]] = None,
+            ):
+        # check predicate
+        if isinstance(predicate, URI):
+            predicate = Predicate(predicate)
+        elif not isinstance(predicate, PredicateExpression):
+            raise TypeError(predicate)
+        # check count
+        if count is None:
+            count = GreaterThan(1, strict=False)
+        elif isinstance(count, int):
+            count = Equals(count)
+        elif not isinstance(count, FilterExpression):
+            raise TypeError(count)
+        # assign members
+        self.predicate = predicate
+        self.count = count
+
+    def __repr__(self) -> str:
+        return f'{typename(self)}({self.predicate}, {self.count})'
+
+    def __hash__(self) -> int:
+        return hash((super().__hash__(), self.predicate, self.count))
+
+    def __eq__(self, other) -> bool:
+        return super().__eq__(other) \
+           and self.predicate == other.predicate \
+           and self.count == other.count
+
+
+class _Value(FilterExpression):
+    """Base class for conditions that compare against a target value.
+    """
+
+    # target value.
+    value: typing.Any
+
+    def __init__(self, value: typing.Any):
+        self.value = value
+
+    def __repr__(self) -> str:
+        return f'{typename(self)}({self.value})'
+
+    def __hash__(self) -> int:
+        return hash((super().__hash__(), self.value))
+
+    def __eq__(self, other) -> bool:
+        return super().__eq__(other) and self.value == other.value
+
+
+class Is(_Value):
+    """Match the URI of a node."""
+
+
+class Equals(_Value):
+    """Value matches exactly.
+    NOTE: Value format must correspond to literal type; can be a string, a number, or a Node
+    """
+
+
+class Substring(_Value):
+    """Value matches a substring
+    NOTE: value format must be a string
+    """
+
+
+class StartsWith(_Value):
+    """Value begins with a given string."""
+
+
+class EndsWith(_Value):
+    """Value ends with a given string."""
+
+
+class _Bounded(FilterExpression):
+    """Base class for conditions that compare against a numerical threshold.
+    """
+
+    # bound.
+    threshold: float
+
+    # open (True, threshold excluded) or closed (False, threshold included) bound.
+    strict: bool
+
+    def __init__(
+            self,
+            threshold: float,
+            strict: bool = True,
+            ):
+        self.threshold = float(threshold)
+        self.strict = bool(strict)
+
+    def __repr__(self) -> str:
+        return f'{typename(self)}({self.threshold}, {self.strict})'
+
+    def __hash__(self) -> int:
+        return hash((super().__hash__(), self.threshold, self.strict))
+
+    def __eq__(self, other) -> bool:
+        return super().__eq__(other) \
+           and self.threshold == other.threshold \
+           and self.strict == other.strict
+
+
+
+class LessThan(_Bounded):
+    """Value is (strictly) smaller than threshold.
+    NOTE: only on numerical literals
+    """
+
+
+class GreaterThan(_Bounded):
+    """Value is (strictly) larger than threshold
+    NOTE: only on numerical literals
+    """
+
+
+class Predicate(PredicateExpression):
+    """A single predicate."""
+
+    # predicate URI
+    predicate: URI
+
+    # reverse the predicate's direction
+    reverse: bool
+
+    def __init__(
+            self,
+            predicate: URI,
+            reverse: typing.Optional[bool] = False,
+            ):
+        # check arguments
+        if not isinstance(predicate, URI):
+            raise TypeError(predicate)
+        # assign members
+        self.predicate = predicate
+        self.reverse = bool(reverse)
+
+    def __repr__(self) -> str:
+        return f'{typename(self)}({self.predicate}, {self.reverse})'
+
+    def __hash__(self) -> int:
+        return hash((super().__hash__(), self.predicate, self.reverse))
+
+    def __eq__(self, other) -> bool:
+        return super().__eq__(other) \
+           and self.predicate == other.predicate \
+           and self.reverse == other.reverse
+
+
+class OneOf(PredicateExpression, abc.Collection):
+    """A set of predicate alternatives.
+
+    The predicates' domains must be ascendants or descendants of each other.
+    The overall domain is the most specific one.
+
+    The predicates' ranges must be ascendants or descendants of each other.
+    The overall range is the most generic one.
+    """
+
+    # predicate alternatives
+    expr: typing.Set[PredicateExpression]
+
+    def __init__(self, *expr: typing.Union[PredicateExpression, URI]):
+        # unfold arguments
+        unfolded = set(normalize_args(*expr)) # type: ignore [arg-type] # this is getting too complex...
+        # check arguments
+        if len(unfolded) == 0:
+            raise AttributeError('expected at least one expression, found none')
+        # ensure PredicateExpression
+        unfolded = {Predicate(e) if isinstance(e, URI) else e for e in unfolded}
+        # check type
+        if not all(isinstance(e, PredicateExpression) for e in unfolded):
+            raise TypeError(expr)
+        # assign member
+        self.expr = unfolded
+
+    def __contains__(self, expr: typing.Any) -> bool:
+        """Return True if *expr* is among the child expressions."""
+        return expr in self.expr
+
+    def __iter__(self) -> typing.Iterator[PredicateExpression]:
+        """Iterator over child expressions."""
+        return iter(self.expr)
+
+    def __len__(self) -> int:
+        """Number of child expressions."""
+        return len(self.expr)
+
+    def __repr__(self) -> str:
+        return f'{typename(self)}({self.expr})'
+
+    def __hash__(self) -> int:
+        return hash((super().__hash__(), tuple(self.expr))) # FIXME: Unique hash of different orders over self.expr
+
+    def __eq__(self, other) -> bool:
+        return super().__eq__(other) and self.expr == other.expr
+
+
+# Helpers
+
+def IsIn(*values): # pylint: disable=invalid-name # explicitly mimics an expression
+    """Match any of the given URIs."""
+    return Or(Is(value) for value in normalize_args(*values))
+
+def IsNotIn(*values): # pylint: disable=invalid-name # explicitly mimics an expression
+    """Match none of the given URIs."""
+    return Not(IsIn(*values))
+
 ## EOF ##
diff --git a/bsfs/query/validator.py b/bsfs/query/validator.py
index 123b947..352203a 100644
--- a/bsfs/query/validator.py
+++ b/bsfs/query/validator.py
@@ -9,6 +9,8 @@ import typing
 
 # bsfs imports
 from bsfs import schema as bsc
+from bsfs.namespace import ns
+from bsfs.utils import errors, typename
 
 # inner-module imports
 from . import ast
@@ -22,6 +24,18 @@ __all__ : typing.Sequence[str] = (
 ## code ##
 
 class Filter():
+    """Validate a `bsfs.query.ast.filter` query's structure and schema compliance.
+
+    * Conditions (Bounded, Value) can only be applied on literals
+    * Branches, Is, and Has can only be applied on nodes
+    * Predicates' domain and range must match
+    * Predicate paths must follow the schema
+    * Referenced types are present in the schema
+
+    """
+
+    # vertex types
+    T_VERTEX = typing.Union[bsc.Node, bsc.Literal] # FIXME: Shouldn't this be in the schema?
 
     # schema to validate against.
     schema: bsc.Schema
@@ -29,180 +43,182 @@ class Filter():
     def __init__(self, schema: bsc.Schema):
         self.schema = schema
 
-    def parse(self, node: ast.filter.FilterExpression, subject: bsc.types._Vertex):
-        # subject is a node type
-        if not isinstance(subject, bsc.Node):
-            raise errors.ConsistencyError(f'Expected a node, found {subject}')
-        # subject exists in the schema
-        if subject not in self.schema.nodes:
-            raise errors.ConsistencyError(f'Invalid node type {subject}')
-        # root expression is valid
-        self._parse(node, subject)
+    def __call__(self, root_type: bsc.Node, query: ast.filter.FilterExpression):
+        """Validate a filter *query*, assuming the subject is of type *root_type*.
+
+        Raises a `bsfs.utils.errors.ConsistencyError` if the query violates the schema.
+        Raises a `bsfs.utils.errors.BackendError` if the query structure is invalid.
+
+        """
+        # root_type must be a schema.Node
+        if not isinstance(root_type, bsc.Node):
+            raise TypeError(f'Expected a node, found {typename(root_type)}')
+        # root_type must exist in the schema
+        if root_type not in self.schema.nodes():
+            raise errors.ConsistencyError(f'{root_type} is not defined in the schema')
+        # check root expression
+        self._parse_filter_expression(root_type, query)
         # all tests passed
         return True
 
 
-    def _parse_numerical_expression(self, node: ast.filter.FilterExpression, subject: bsc.types._Vertex):
-        if isinstance(node, ast.filter.And):
-            return self._and(node, subject)
-        elif isinstance(node, ast.filter.Or):
-            return self._or(node, subject)
-        elif isinstance(node, ast.filter.LessThan):
-            return self._lessThan(node, subject)
-        elif isinstance(node, ast.filter.GreaterThan):
-            return self._greaterThan(node, subject)
-        elif isinstance(node, ast.filter.Equals):
-            return self._equals(node, subject, numerical=True)
-        else:
-            raise errors.ConsistencyError(f'Expected a numerical expression, found {node}')
-
-
-    def __branch(self, node: typing.Union[ast.filter.Any, ast.filter.And], subject: bsc.types._Vertex):
-        # subject is a node type
-        if not isinstance(subject, bsc.Node):
-            raise errors.ConsistencyError(f'Expected a node, found {subject}')
-        # subject exists in the schema
-        if subject not in self.schema.nodes:
-            raise errors.ConsistencyError(f'Invalid node type {subject}')
-        # predicate is valid
-        dom, rng = self._parse_predicate_expression(node.predicate)
-        # subject is a subtype of the predicate's domain
-        if not subject <= dom:
-            raise errors.ConsistencyError(f'Expected type {dom}, found {subject}')
-        # child expression is valid
-        self._parse_filter_expression(node.expr, rng)
+    ## routing methods
+
+    def _parse_filter_expression(self, type_: T_VERTEX, node: ast.filter.FilterExpression):
+        """Route *node* to the handler of the respective FilterExpression subclass."""
+        if isinstance(node, ast.filter.Is):
+            return self._is(type_, node)
+        if isinstance(node, ast.filter.Not):
+            return self._not(type_, node)
+        if isinstance(node, ast.filter.Has):
+            return self._has(type_, node)
+        if isinstance(node, (ast.filter.Any, ast.filter.All)):
+            return self._branch(type_, node)
+        if isinstance(node, (ast.filter.And, ast.filter.Or)):
+            return self._agg(type_, node)
+        if isinstance(node, (ast.filter.Equals, ast.filter.Substring, ast.filter.StartsWith, ast.filter.EndsWith)):
+            return self._value(type_, node)
+        if isinstance(node, (ast.filter.LessThan, ast.filter.GreaterThan)):
+            return self._bounded(type_, node)
+        # invalid node
+        raise errors.BackendError(f'expected filter expression, found {node}')
+
+    def _parse_predicate_expression(self, node: ast.filter.PredicateExpression) -> typing.Tuple[T_VERTEX, T_VERTEX]:
+        """Route *node* to the handler of the respective PredicateExpression subclass."""
+        if isinstance(node, ast.filter.Predicate):
+            return self._predicate(node)
+        if isinstance(node, ast.filter.OneOf):
+            return self._one_of(node)
+        # invalid node
+        raise errors.BackendError(f'expected predicate expression, found {node}')
+
+
+    ## predicate expressions
+
+    def _predicate(self, node: ast.filter.Predicate) -> typing.Tuple[T_VERTEX, T_VERTEX]:
+        # predicate exists in the schema
+        if not self.schema.has_predicate(node.predicate):
+            raise errors.ConsistencyError(f'predicate {node.predicate} is not in the schema')
+        # determine domain and range
+        pred = self.schema.predicate(node.predicate)
+        dom, rng = pred.domain, pred.range
+        if rng is None:
+            # FIXME: It is a design error that Predicates can have a None range...
+            raise errors.BackendError(f'predicate {pred} has no range')
+        if node.reverse:
+            dom, rng = rng, dom # type: ignore [assignment] # variable re-use confuses mypy
+        # return domain and range
+        return dom, rng
 
-    def _any(self, node: ast.filter.Any, subject: bsc.types._Vertex):
-        return self.__branch(node, subject)
+    def _one_of(self, node: ast.filter.OneOf) -> typing.Tuple[T_VERTEX, T_VERTEX]:
+        # determine domain and range types
+        # NOTE: select the most specific domain and the most generic range
+        dom, rng = None, None
+        for pred in node:
+            # parse child expression
+            subdom, subrng = self._parse_predicate_expression(pred)
+            try:
+                # determine overall domain
+                if dom is None or subdom < dom: # pick most specific domain
+                    dom = subdom
+                # domains must be related across all child expressions
+                if not subdom <= dom and not subdom >= dom:
+                    raise errors.ConsistencyError(f'domains {subdom} and {dom} are not related')
+            except TypeError as err: # compared literal vs. node
+                raise errors.ConsistencyError(f'domains {subdom} and {dom} are not of the same type') from err
 
-    def _all(self, node: ast.filter.All, subject: bsc.types._Vertex):
-        return self.__branch(node, subject)
+            try:
+                # determine overall range
+                if rng is None or subrng > rng: # pick most generic range
+                    rng = subrng
+                # ranges must be related across all child expressions
+                if not subrng <= rng and not subrng >= rng:
+                    raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related')
+            except TypeError as err: # compared literal vs. node
+                raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not of the same type') from err
+        # check domain and range
+        if dom is None or rng is None:
+            # OneOf guarantees at least one expression, these two cases cannot happen
+            raise errors.UnreachableError()
+        # return domain and range
+        return dom, rng
 
 
-    def __agg(self, node: typing.Union[ast.filter.And, ast.filter.Or], subject: bsc.types._Vertex):
+    ## intermediates
+
+    def _branch(self, type_: T_VERTEX, node: ast.filter._Branch):
+        # type is a Node
+        if not isinstance(type_, bsc.Node):
+            raise errors.ConsistencyError(f'expected a Node, found {type_}')
+        # type exists in the schema
+        # FIXME: Isn't it actually guaranteed that the type (except the root type) is part of the schema?
+        # all types can be traced back to (a) root_type, (b) predicate, or (c) manually set (e.g. in _is).
+        # For (a), we do (and have to) perform a check. For (c), the code base should be consistent throughout
+        # the module, so this is an assumption that has to be ensured in schema.Schema. For (b), we know (and
+        # check) that the predicate is in the schema, hence all node/literals derived from it are also in the
+        # schema by construction of the schema.Schema class. So, why do we check this every time?
+        if type_ not in self.schema.nodes():
+            raise errors.ConsistencyError(f'node {type_} is not in the schema')
+        # predicate is valid
+        dom, rng = self._parse_predicate_expression(node.predicate)
+        # type_ is a subtype of the predicate's domain
+        if not type_ <= dom:
+            raise errors.ConsistencyError(f'expected type {dom} or subtype thereof, found {type_}')
+        # child expression is valid
+        self._parse_filter_expression(rng, node.expr)
+
+    def _agg(self, type_: T_VERTEX, node: ast.filter._Agg):
         for expr in node:
             # child expression is valid
-            self._parse_filter_expression(expr, subject)
-
-    def _and(self, node: ast.filter.And, subject: bsc.types._Vertex):
-        return self.__agg(node, subject)
-
-    def _or(self, node: ast.filter.Or, subject: bsc.types._Vertex):
-        return self.__agg(node, subject)
-
+            self._parse_filter_expression(type_, expr)
 
-    def _not(self, node: ast.filter.Not, subject: bsc.types._Vertex):
+    def _not(self, type_: T_VERTEX, node: ast.filter.Not):
         # child expression is valid
-        self._parse_filter_expression(node.expr, subject)
-
-
-    def _has(self, node: ast.filter.Has, subject: bsc.types._Vertex):
-        # subject is a node type
-        if not isinstance(subject, bsc.Node):
-            raise errors.ConsistencyError(f'Expected a node, found {subject}')
-        # subject exists in the schema
-        if subject not in self.schema.nodes:
-            raise errors.ConsistencyError(f'Invalid node type {subject}')
+        self._parse_filter_expression(type_, node.expr)
+
+    def _has(self, type_: T_VERTEX, node: ast.filter.Has):
+        # type is a Node
+        if not isinstance(type_, bsc.Node):
+            raise errors.ConsistencyError(f'expected a Node, found {type_}')
+        # type exists in the schema
+        if type_ not in self.schema.nodes():
+            raise errors.ConsistencyError(f'node {type_} is not in the schema')
         # predicate is valid
-        dom, rng = self._parse_predicate_expression(node.predicate)
-        # subject is a subtype of the predicate's domain
-        if not subject <= dom:
-            raise errors.ConsistencyError(f'Expected type {dom}, found {subject}')
+        dom, _= self._parse_predicate_expression(node.predicate)
+        # type_ is a subtype of the predicate's domain
+        if not type_ <= dom:
+            raise errors.ConsistencyError(f'expected type {dom}, found {type_}')
         # node.count is a numerical expression
-        self._parse_numerical_expression(node.count, self.schema.literal(ns.xsd.numerical))
-
-
-    def _equals(self, node: ast.filter.Equals, subject: bsc.types._Vertex, numerical: bool = False):
-        # subject is a literal
-        #if not isinstance(subject, bsc.Literal):
-        #    raise errors.ConsistencyError(f'Expected a literal, found {subject}')
-        if isinstance(subject, bsc.Node):
-            # FIXME: How to handle this case?
-            # FIXME: How to check if a NodeType is acceptable?
-            # FIXME: Maybe use flags to control what is expected as node identifiers?
-            from bsfs.graph.nodes import Nodes # FIXME
-            if not isinstance(node.value, Nodes) and not isinstance(node.value, URI):
-                raise errors.ConsistencyError(f'Expected a Nodes or URI, found {node.value}')
-        elif isinstance(subject, bsc.Literal):
-            # literal exists in the schema
-            if subject not in self.schema.literals:
-                raise errors.ConsistencyError(f'Invalid literal type {subject}')
-        else:
-            # FIXME:
-            raise errors.ConsistencyError(f'Expected a literal, found {subject}')
-        # node.value is numeric (if requested)
-        if numerical and not isinstance(node.value, float) and not isinstance(node.value, int):
-            raise errors.ConsistencyError(f'Expected a numerical value (int or float), found {node.value}')
-        # NOTE: We cannot check if node.value agrees with the subject since we don't know
-        # all literal types, their hierarchy, and how the backend converts datatypes.
-
-
-    def _substring(self, node: ast.filter.Substring, subject: bsc.types._Vertex):
-        # subject is a literal
-        if not isinstance(subject, bsc.Literal):
-            raise errors.ConsistencyError(f'Expected a literal, found {subject}')
-        # literal exists in the schema
-        if subject not in self.schema.literals:
-            raise errors.ConsistencyError(f'Invalid literal type {subject}')
-        # node.value matches literal datatype
-        if not subject.is_a(ns.xsd.string):
-            raise errors.ConsistencyError(f'Expected a string literal, found {subject}')
-
-
-    def _lessThan(self, node: ast.filter.LessThan, subject: bsc.types._Vertex):
-        # subject is a literal
-        if not isinstance(subject, bsc.Literal):
-            raise errors.ConsistencyError(f'Expected a literal, found {subject}')
-        # literal exists in the schema
-        if subject not in self.schema.literals:
-            raise errors.ConsistencyError(f'Invalid literal type {subject}')
-        # subject is numerical
-        if not subject.is_a(ns.xsd.numerical):
-            raise errors.ConsistencyError(f'Expected a numerical literal, found {subject}')
-
-
-    def _greaterThan(self, node: ast.filter.GreaterThan, subject: bsc.types._Vertex):
-        # subject is a literal
-        if not isinstance(subject, bsc.Literal):
-            raise errors.ConsistencyError(f'Expected a literal, found {subject}')
-        # literal exists in the schema
-        if subject not in self.schema.literals:
-            raise errors.ConsistencyError(f'Invalid literal type {subject}')
-        # subject is numerical
-        if not subject.is_a(ns.xsd.numerical):
-            raise errors.ConsistencyError(f'Expected a numerical literal, found {subject}')
-
-
-    def _predicate(self, node: ast.filter.Predicate):
-        try:
-            # predicate exists in the schema
-            pred = self.schema.predicate(node.predicate)
-        except KeyError:
-            raise errors.ConsistencyError(f'') # FIXME
-        if node.reverse:
-            return pred.range, pred.domain
-        else:
-            return pred.domain, pred.range
-
+        # FIXME: We have to ensure that ns.xsd.integer is always known in the schema!
+        self._parse_filter_expression(self.schema.literal(ns.xsd.integer), node.count)
+
+
+    ## conditions
+
+    def _is(self, type_: T_VERTEX, node: ast.filter.Is): # pylint: disable=unused-argument # (node)
+        if not isinstance(type_, bsc.Node):
+            raise errors.ConsistencyError(f'expected a Node, found {type_}')
+        if type_ not in self.schema.nodes():
+            raise errors.ConsistencyError(f'node {type_} is not in the schema')
+
+    def _value(self, type_: T_VERTEX, node: ast.filter._Value): # pylint: disable=unused-argument # (node)
+        # type is a literal
+        if not isinstance(type_, bsc.Literal):
+            raise errors.ConsistencyError(f'expected a Literal, found {type_}')
+        # type exists in the schema
+        if type_ not in self.schema.literals():
+            raise errors.ConsistencyError(f'literal {type_} is not in the schema')
+        # FIXME: Check if node.value corresponds to type_
+        # FIXME: A specific literal might be requested (i.e., a numeric type when used in Has)
+
+    def _bounded(self, type_: T_VERTEX, node: ast.filter._Bounded): # pylint: disable=unused-argument # (node)
+        # type is a literal
+        if not isinstance(type_, bsc.Literal):
+            raise errors.ConsistencyError(f'expected a Literal, found {type_}')
+        # type exists in the schema
+        if type_ not in self.schema.literals():
+            raise errors.ConsistencyError(f'literal {type_} is not in the schema')
+        # FIXME: Check if node.threshold corresponds to type_
 
-    def _oneOf(self, node: ast.filter.OneOf):
-        dom, rng = None, None
-        for pred in node:
-            try:
-                # parse child expression
-                subdom, subrng = self._parse_predicate_expression(pred)
-                # domain and range must be related across all child expressions
-                if not subdom <= dom and not subdom >= dom:
-                    raise errors.ConsistencyError(f'') # FIXME
-                if not subrng <= rng and not subrng >= rng:
-                    raise errors.ConsistencyError(f'') # FIXME
-                # determine overall domain and range
-                if dom is None or subdom < dom: # pick most specific domain
-                    dom = subdom
-                if rng is None or subrng > rng: # pick most generic range
-                    rng = subrng
-            except KeyError:
-                raise errors.ConsistencyError(f'')
-        return dom, rng
 
 ## EOF ##
diff --git a/bsfs/utils/__init__.py b/bsfs/utils/__init__.py
index 94680ee..6737cef 100644
--- a/bsfs/utils/__init__.py
+++ b/bsfs/utils/__init__.py
@@ -9,7 +9,7 @@ import typing
 
 # inner-module imports
 from . import errors
-from .commons import typename
+from .commons import typename, normalize_args
 from .uri import URI
 from .uuid import UUID, UCID
 
@@ -19,6 +19,7 @@ __all__ : typing.Sequence[str] = (
     'URI',
     'UUID',
     'errors',
+    'normalize_args',
     'typename',
     )
 
diff --git a/bsfs/utils/commons.py b/bsfs/utils/commons.py
index bad2fe0..e9f0b7f 100644
--- a/bsfs/utils/commons.py
+++ b/bsfs/utils/commons.py
@@ -5,10 +5,12 @@ A copy of the license is provided with the project.
 Author: Matthias Baumgartner, 2022
 """
 # imports
+from collections import abc
 import typing
 
 # exports
 __all__: typing.Sequence[str] = (
+    'normalize_args',
     'typename',
     )
 
@@ -19,5 +21,37 @@ def typename(obj) -> str:
     """Return the type name of *obj*."""
     return type(obj).__name__
 
+# argument type in `normalize_args`.
+ArgType = typing.TypeVar('ArgType') # pylint: disable=invalid-name # type vars don't follow the usual convention
+
+def normalize_args(
+        *args: typing.Union[ArgType, typing.Iterable[ArgType], typing.Iterator[ArgType]]
+        ) -> typing.Tuple[ArgType, ...]:
+    """Arguments to a function can be passed as individual arguments, list-like
+    structures, or iterables. This function processes any of these styles and
+    returns a tuple of the respective items. Typically used within a function
+    to provide a flexible interface but still have parameters in a normalized form.
+
+    Examples:
+
+    >>> normalize_args(0,1,2)
+    (0, 1, 2)
+    >>> normalize_args([0,1,2])
+    (0, 1, 2)
+    >>> normalize_args(range(3))
+    (0, 1, 2)
+
+    """
+    if len(args) == 0: # foo()
+        return tuple()
+    if len(args) > 1: # foo(0, 1, 2)
+        return tuple(args) # type: ignore [arg-type] # we assume that argument styles (arg vs. iterable) are not mixed.
+    if isinstance(args[0], abc.Iterator): # foo(iter([0,1,2]))
+        return tuple(args[0])
+    if isinstance(args[0], abc.Iterable) and not isinstance(args[0], str): # foo([0, 1, 2])
+        return tuple(args[0])
+    # foo(0)
+    return (args[0], ) # type: ignore [return-value] # if args[0] is a str, we assume that ArgType was str.
+
 
 ## EOF ##
diff --git a/bsfs/utils/errors.py b/bsfs/utils/errors.py
index c5e8e16..be9d40e 100644
--- a/bsfs/utils/errors.py
+++ b/bsfs/utils/errors.py
@@ -38,4 +38,7 @@ class UnreachableError(ProgrammingError):
 class ConfigError(_BSFSError):
     """User config issue."""
 
+class BackendError(_BSFSError):
+    """Could not parse an AST structure."""
+
 ## EOF ##
-- 
cgit v1.2.3


From 73e39cb4967949025aefe874f401e27b0abb772c Mon Sep 17 00:00:00 2001
From: Matthias Baumgartner <dev@igsor.net>
Date: Thu, 22 Dec 2022 20:29:57 +0100
Subject: filter ast parser and get method in sparql store

---
 bsfs/triple_store/base.py                |   6 +-
 bsfs/triple_store/sparql/parse_filter.py | 307 +++++++++++++++++++++++++++++++
 bsfs/triple_store/sparql/sparql.py       |  51 ++++-
 3 files changed, 357 insertions(+), 7 deletions(-)
 create mode 100644 bsfs/triple_store/sparql/parse_filter.py

(limited to 'bsfs')

diff --git a/bsfs/triple_store/base.py b/bsfs/triple_store/base.py
index 5ff9523..7e03714 100644
--- a/bsfs/triple_store/base.py
+++ b/bsfs/triple_store/base.py
@@ -113,9 +113,11 @@ class TripleStoreBase(abc.ABC):
     def get(
             self,
             node_type: _schema.Node,
-            query: ast.filter.FilterExpression,
+            query: typing.Optional[ast.filter.FilterExpression] = None,
             ) -> typing.Iterator[URI]:
-        """Return guids of nodes of type *node_type* that match the *query*."""
+        """Return guids of nodes of type *node_type* that match the *query*.
+        Return all guids of the respective type if *query* is None.
+        """
 
     @abc.abstractmethod
     def exists(
diff --git a/bsfs/triple_store/sparql/parse_filter.py b/bsfs/triple_store/sparql/parse_filter.py
new file mode 100644
index 0000000..d4db0aa
--- /dev/null
+++ b/bsfs/triple_store/sparql/parse_filter.py
@@ -0,0 +1,307 @@
+"""
+
+Part of the BlackStar filesystem (bsfs) module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsfs imports
+from bsfs import schema as bsc
+from bsfs.namespace import ns
+from bsfs.query import ast
+from bsfs.utils import URI, errors
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Filter',
+    )
+
+class _GenHopName():
+    """Generator that produces a new unique symbol name with each iteration."""
+
+    # Symbol name prefix.
+    prefix: str
+
+    # Current counter.
+    curr: int
+
+    def __init__(self, prefix: str = '?hop', start: int = 0):
+        self.prefix = prefix
+        self.curr = start - 1
+
+    def __next__(self):
+        """Generate and return the next unique name."""
+        self.curr += 1
+        return self.prefix + str(self.curr)
+
+
+class Filter():
+    """Translate `bsfs.query.ast.filter` structures into Sparql queries."""
+
+    # Current schema to validate against.
+    schema: bsc.Schema
+
+    # Generator that produces unique symbol names.
+    ngen: _GenHopName
+
+    # Vertex type.
+    T_VERTEX = typing.Union[bsc.Node, bsc.Literal]
+
+    def __init__(self, schema):
+        self.schema = schema
+        self.ngen = _GenHopName()
+
+    def __call__(
+            self,
+            root_type: bsc.Node,
+            root: typing.Optional[ast.filter.FilterExpression] = None,
+            ) -> str:
+        """
+        """
+        # check root_type
+        if not isinstance(root_type, bsc.Node):
+            raise errors.BackendError(f'expected Node, found {root_type}')
+        if root_type not in self.schema.nodes():
+            raise errors.ConsistencyError(f'node {root_type} is not in the schema')
+        # parse root
+        if root is None:
+            cond = ''
+        else:
+            cond = self._parse_filter_expression(root_type, root, '?ent')
+        # assemble query
+        return f'''
+            SELECT ?ent
+            WHERE {{
+                ?ent <{ns.rdf.type}>/<{ns.rdfs.subClassOf}>* <{root_type.uri}> .
+                {cond}
+            }}
+            '''
+
+    def _parse_filter_expression(self, type_: T_VERTEX, node: ast.filter.FilterExpression, head: str) -> str:
+        """Route *node* to the handler of the respective FilterExpression subclass."""
+        if isinstance(node, ast.filter.Is):
+            return self._is(type_, node, head)
+        if isinstance(node, ast.filter.Not):
+            return self._not(type_, node, head)
+        if isinstance(node, ast.filter.Has):
+            return self._has(type_, node, head)
+        if isinstance(node, ast.filter.Any):
+            return self._any(type_, node, head)
+        if isinstance(node, ast.filter.All):
+            return self._all(type_, node, head)
+        if isinstance(node, ast.filter.And):
+            return self._and(type_, node, head)
+        if isinstance(node, ast.filter.Or):
+            return self._or(type_, node, head)
+        if isinstance(node, ast.filter.Equals):
+            return self._equals(type_, node, head)
+        if isinstance(node, ast.filter.Substring):
+            return self._substring(type_, node, head)
+        if isinstance(node, ast.filter.StartsWith):
+            return self._starts_with(type_, node, head)
+        if isinstance(node, ast.filter.EndsWith):
+            return self._ends_with(type_, node, head)
+        if isinstance(node, ast.filter.LessThan):
+            return self._less_than(type_, node, head)
+        if isinstance(node, ast.filter.GreaterThan):
+            return self._greater_than(type_, node, head)
+        # invalid node
+        raise errors.BackendError(f'expected filter expression, found {node}')
+
+    def _parse_predicate_expression(
+            self,
+            type_: T_VERTEX,
+            node: ast.filter.PredicateExpression
+            ) -> typing.Tuple[str, T_VERTEX]:
+        """Route *node* to the handler of the respective PredicateExpression subclass."""
+        if isinstance(node, ast.filter.Predicate):
+            return self._predicate(type_, node)
+        if isinstance(node, ast.filter.OneOf):
+            return self._one_of(type_, node)
+        # invalid node
+        raise errors.BackendError(f'expected predicate expression, found {node}')
+
+    def _one_of(self, node_type: T_VERTEX, node: ast.filter.OneOf) -> typing.Tuple[str, T_VERTEX]:
+        """
+        """
+        if not isinstance(node_type, bsc.Node):
+            raise errors.BackendError(f'expected Node, found {node_type}')
+        # walk through predicates
+        suburi, rng = set(), None
+        for pred in node: # OneOf guarantees at least one expression
+            puri, subrng = self._parse_predicate_expression(node_type, pred)
+            # track predicate uris
+            suburi.add(puri)
+            try:
+                # check for more generic range
+                if rng is None or subrng > rng:
+                    rng = subrng
+                # check range consistency
+                if not subrng <= rng and not subrng >= rng:
+                    raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related')
+            except TypeError as err: # subrng and rng are not comparable
+                raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related') from err
+        if rng is None:
+            # for mypy to be certain of the rng type
+            # rng stays None only if the loop body never ran, which OneOf rules out (see above)
+            raise errors.UnreachableError()
+        # return joint predicate expression and next range
+        return '|'.join(suburi), rng
+
+    def _predicate(self, node_type: T_VERTEX, node: ast.filter.Predicate) -> typing.Tuple[str, T_VERTEX]:
+        """
+        """
+        # check node_type
+        if not isinstance(node_type, bsc.Node):
+            raise errors.BackendError(f'expected Node, found {node_type}')
+        # fetch predicate and its uri
+        puri = node.predicate
+        # get and check predicate, domain, and range
+        if not self.schema.has_predicate(puri):
+            raise errors.ConsistencyError(f'predicate {puri} is not in the schema')
+        pred = self.schema.predicate(puri)
+        if pred.range is None:
+            # FIXME: It is a design error that Predicates can have a None range...
+            raise errors.BackendError(f'predicate {pred} has no range')
+        dom, rng = pred.domain, pred.range
+        # encapsulate predicate uri
+        puri = f'<{puri}>' # type: ignore [assignment] # variable re-use confuses mypy
+        # apply reverse flag
+        if node.reverse:
+            puri = URI('^' + puri)
+            dom, rng = rng, dom # type: ignore [assignment] # variable re-use confuses mypy
+        # check path consistency
+        if not node_type <= dom:
+            raise errors.ConsistencyError(f'expected type {dom} or subtype thereof, found {node_type}')
+        # return predicate URI and next node type
+        return puri, rng
+
+    def _any(self, node_type: T_VERTEX, node: ast.filter.Any, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Node):
+            raise errors.BackendError(f'expected Node, found {node_type}')
+        # parse predicate
+        pred, next_type = self._parse_predicate_expression(node_type, node.predicate)
+        # parse expression
+        nexthead = next(self.ngen)
+        expr = self._parse_filter_expression(next_type, node.expr, nexthead)
+        # combine results
+        return f'{head} {pred} {nexthead} . {expr}'
+
+    def _all(self, node_type: T_VERTEX, node: ast.filter.All, head: str) -> str:
+        """
+        """
+        # NOTE: All(P, E) := Not(Any(P, Not(E))) and EXISTS(P, ?)
+        if not isinstance(node_type, bsc.Node):
+            raise errors.BackendError(f'expected Node, found {node_type}')
+        # parse rewritten ast
+        expr = self._parse_filter_expression(node_type,
+            ast.filter.Not(
+                ast.filter.Any(node.predicate,
+                    ast.filter.Not(node.expr))), head)
+        # parse predicate for existence constraint
+        pred, _ = self._parse_predicate_expression(node_type, node.predicate)
+        temphead = next(self.ngen)
+        # return existence and rewritten expression
+        return f'FILTER EXISTS {{ {head} {pred} {temphead} }} . ' + expr
+
+    def _and(self, node_type: T_VERTEX, node: ast.filter.And, head: str) -> str:
+        """
+        """
+        sub = [self._parse_filter_expression(node_type, expr, head) for expr in node]
+        return ' . '.join(sub)
+
+    def _or(self, node_type: T_VERTEX, node: ast.filter.Or, head: str) -> str:
+        """
+        """
+        # potential special case optimization:
+        # * ast: Or(Equals('foo'), Equals('bar'), ...)
+        # * query: VALUES ?head { "value1"^^<...> "value2"^^<...> "value3"^<...> ... }
+        sub = [self._parse_filter_expression(node_type, expr, head) for expr in node]
+        sub = ['{' + expr + '}' for expr in sub]
+        return ' UNION '.join(sub)
+
+    def _not(self, node_type: T_VERTEX, node: ast.filter.Not, head: str) -> str:
+        """
+        """
+        expr = self._parse_filter_expression(node_type, node.expr, head)
+        if isinstance(node_type, bsc.Literal):
+            return f'MINUS {{ {expr} }}'
+        # NOTE: for bsc.Node types, we must include at least one expression in the body of MINUS,
+        # otherwise the connection between the context and body of MINUS is lost.
+        # The simplest (and non-interfering) choice is a type statement.
+        return f'MINUS {{ {head} <{ns.rdf.type}>/<{ns.rdfs.subClassOf}>* <{node_type.uri}> . {expr} }}'
+
+    def _has(self, node_type: T_VERTEX, node: ast.filter.Has, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Node):
+            raise errors.BackendError(f'expected Node, found {node_type}')
+        # parse predicate
+        pred, _ = self._parse_predicate_expression(node_type, node.predicate)
+        # get new heads
+        inner = next(self.ngen)
+        outer = next(self.ngen)
+        # predicate count expression (fetch number of predicates at *head*)
+        num_preds = f'{{ SELECT (COUNT(distinct {inner}) as {outer}) WHERE {{ {head} {pred} {inner} }} }}'
+        # count expression
+        # FIXME: We have to ensure that ns.xsd.integer is always known in the schema!
+        count_bounds = self._parse_filter_expression(self.schema.literal(ns.xsd.integer), node.count, outer)
+        # combine
+        return num_preds + ' . ' + count_bounds
+
+    def _is(self, node_type: T_VERTEX, node: ast.filter.Is, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Node):
+            raise errors.BackendError(f'expected Node, found {node_type}')
+        return f'VALUES {head} {{ <{node.value}> }}'
+
+    def _equals(self, node_type: T_VERTEX, node: ast.filter.Equals, head: str) -> str:
+        """Return a VALUES clause that binds *head* to *node.value*, typed as *node_type*.
+        Raises a BackendError if *node_type* is not a Literal."""
+        if not isinstance(node_type, bsc.Literal):
+            raise errors.BackendError(f'expected Literal, found {node_type}')
+        return f'VALUES {head} {{ "{node.value}"^^<{node_type.uri}> }}'
+
+    def _substring(self, node_type: T_VERTEX, node: ast.filter.Substring, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Literal):
+            raise errors.BackendError(f'expected Literal, found {node_type}')
+        return f'FILTER contains(str({head}), "{node.value}")'
+
+    def _starts_with(self, node_type: T_VERTEX, node: ast.filter.StartsWith, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Literal):
+            raise errors.BackendError(f'expected Literal, found {node_type}')
+        return f'FILTER strstarts(str({head}), "{node.value}")'
+
+    def _ends_with(self, node_type: T_VERTEX, node: ast.filter.EndsWith, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Literal):
+            raise errors.BackendError(f'expected Literal, found {node_type}')
+        return f'FILTER strends(str({head}), "{node.value}")'
+
+    def _less_than(self, node_type: T_VERTEX, node: ast.filter.LessThan, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Literal):
+            raise errors.BackendError(f'expected Literal, found {node_type}')
+        equality = '=' if not node.strict else ''
+        return f'FILTER ({head} <{equality} {float(node.threshold)})'
+
+    def _greater_than(self, node_type: T_VERTEX, node: ast.filter.GreaterThan, head: str) -> str:
+        """
+        """
+        if not isinstance(node_type, bsc.Literal):
+            raise errors.BackendError(f'expected Literal, found {node_type}')
+        equality = '=' if not node.strict else ''
+        return f'FILTER ({head} >{equality} {float(node.threshold)})'
+
+## EOF ##
diff --git a/bsfs/triple_store/sparql/sparql.py b/bsfs/triple_store/sparql/sparql.py
index 7172f34..c3cbff6 100644
--- a/bsfs/triple_store/sparql/sparql.py
+++ b/bsfs/triple_store/sparql/sparql.py
@@ -15,6 +15,7 @@ from bsfs.query import ast
 from bsfs.utils import errors, URI
 
 # inner-module imports
+from . import parse_filter
 from .. import base
 
 
@@ -86,11 +87,15 @@ class SparqlStore(base.TripleStoreBase):
     # The local schema.
     _schema: bsc.Schema
 
+    # Filter parser
+    _filter_parser: parse_filter.Filter
+
     def __init__(self):
         super().__init__(None)
         self._graph = rdflib.Graph()
         self._transaction = _Transaction(self._graph)
         self._schema = bsc.Schema.Empty()
+        self._filter_parser = parse_filter.Filter(self._schema)
 
     # NOTE: mypy and pylint complain about the **kwargs not being listed (contrasting super)
     # However, not having it here is clearer since it's explicit that there are no arguments.
@@ -127,10 +132,17 @@ class SparqlStore(base.TripleStoreBase):
         # get deleted classes
         sub = self.schema - schema
 
-        # remove predicate instances
         for pred in sub.predicates:
+            # remove predicate instances
             for src, trg in self._graph.subject_objects(rdflib.URIRef(pred.uri)):
                 self._transaction.remove((src, rdflib.URIRef(pred.uri), trg))
+            # remove predicate definition
+            if pred.parent is not None:
+                self._transaction.remove((
+                    rdflib.URIRef(pred.uri),
+                    rdflib.RDFS.subClassOf,
+                    rdflib.URIRef(pred.parent.uri),
+                    ))
 
         # remove node instances
         for node in sub.nodes:
@@ -144,17 +156,46 @@ class SparqlStore(base.TripleStoreBase):
                     self._transaction.remove((inst, pred, trg))
                 # remove instance
                 self._transaction.remove((inst, rdflib.RDF.type, rdflib.URIRef(node.uri)))
-
-        # NOTE: Nothing to do for literals
+            # remove node definition
+            if node.parent is not None:
+                self._transaction.remove((
+                    rdflib.URIRef(node.uri),
+                    rdflib.RDFS.subClassOf,
+                    rdflib.URIRef(node.parent.uri),
+                    ))
+
+        for lit in sub.literals:
+            # remove literal definition
+            if lit.parent is not None:
+                self._transaction.remove((
+                    rdflib.URIRef(lit.uri),
+                    rdflib.RDFS.subClassOf,
+                    rdflib.URIRef(lit.parent.uri),
+                    ))
+
+        # add predicate, node, and literal hierarchies to the graph
+        for itm in itertools.chain(schema.predicates(), schema.nodes(), schema.literals()):
+            if itm.parent is not None:
+                self._transaction.add((rdflib.URIRef(itm.uri), rdflib.RDFS.subClassOf, rdflib.URIRef(itm.parent.uri)))
 
         # commit instance changes
         self.commit()
 
         # migrate schema
         self._schema = schema
+        self._filter_parser.schema = schema
 
-    def get(self, node_type: bsc.Node, query: ast.filter.FilterExpression) -> typing.Iterator[URI]:
-        raise NotImplementedError()
+    def get(
+            self,
+            node_type: bsc.Node,
+            query: typing.Optional[ast.filter.FilterExpression] = None,
+            ) -> typing.Iterator[URI]:
+        if node_type not in self.schema.nodes():
+            raise errors.ConsistencyError(f'{node_type} is not defined in the schema')
+        if query is not None and not isinstance(query, ast.filter.FilterExpression): # None = no filter
+            raise TypeError(query)
+        for guid, in self._graph.query(self._filter_parser(node_type, query)): # each result row is a 1-tuple
+            yield URI(guid)
 
     def _has_type(self, subject: URI, node_type: bsc.Node) -> bool:
         """Return True if *subject* is a node of class *node_type* or a subclass thereof."""
-- 
cgit v1.2.3


From ca7ee6c59d2eb3f4ec4d16e392d12d946cd85e4d Mon Sep 17 00:00:00 2001
From: Matthias Baumgartner <dev@igsor.net>
Date: Thu, 22 Dec 2022 20:33:00 +0100
Subject: filter-ast based get interface in graph.

* Graph interface: Graph.get added
* Node instance resolver so that Nodes can be used in a filter ast
* AC interface: filter_read added to interface
* upstream test adjustments of previous sparql store changes
---
 bsfs/graph/ac/base.py |   4 ++
 bsfs/graph/ac/null.py |   5 ++
 bsfs/graph/graph.py   |  28 +++++++--
 bsfs/graph/resolve.py | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 193 insertions(+), 5 deletions(-)
 create mode 100644 bsfs/graph/resolve.py

(limited to 'bsfs')

diff --git a/bsfs/graph/ac/base.py b/bsfs/graph/ac/base.py
index bc9aeb3..0703e2e 100644
--- a/bsfs/graph/ac/base.py
+++ b/bsfs/graph/ac/base.py
@@ -10,6 +10,7 @@ import typing
 
 # bsfs imports
 from bsfs import schema
+from bsfs.query import ast
 from bsfs.triple_store import TripleStoreBase
 from bsfs.utils import URI
 
@@ -67,5 +68,8 @@ class AccessControlBase(abc.ABC):
     def createable(self, node_type: schema.Node, guids: typing.Iterable[URI]) -> typing.Iterable[URI]:
         """Return nodes that are allowed to be created."""
 
+    @abc.abstractmethod
+    def filter_read(self, node_type: schema.Node, query: ast.filter.FilterExpression) -> ast.filter.FilterExpression:
+        """Re-write a filter *query* to get (i.e., read) *node_type* nodes."""
 
 ## EOF ##
diff --git a/bsfs/graph/ac/null.py b/bsfs/graph/ac/null.py
index 36838bd..12b4e87 100644
--- a/bsfs/graph/ac/null.py
+++ b/bsfs/graph/ac/null.py
@@ -10,6 +10,7 @@ import typing
 # bsfs imports
 from bsfs import schema
 from bsfs.namespace import ns
+from bsfs.query import ast
 from bsfs.utils import URI
 
 # inner-module imports
@@ -49,4 +50,8 @@ class NullAC(base.AccessControlBase):
         """Return nodes that are allowed to be created."""
         return guids
 
+    def filter_read(self, node_type: schema.Node, query: ast.filter.FilterExpression) -> ast.filter.FilterExpression:
+        """Re-write a filter *query* to get (i.e., read) *node_type* nodes."""
+        return query
+
 ## EOF ##
diff --git a/bsfs/graph/graph.py b/bsfs/graph/graph.py
index 51fe75d..f030fed 100644
--- a/bsfs/graph/graph.py
+++ b/bsfs/graph/graph.py
@@ -9,13 +9,15 @@ import os
 import typing
 
 # bsfs imports
-from bsfs.query import ast
+from bsfs.query import ast, validate
 from bsfs.schema import Schema
 from bsfs.triple_store import TripleStoreBase
 from bsfs.utils import URI, typename
 
 # inner-module imports
+from . import ac
 from . import nodes as _nodes
+from . import resolve
 
 # exports
 __all__: typing.Sequence[str] = (
@@ -44,6 +46,9 @@ class Graph():
     def __init__(self, backend: TripleStoreBase, user: URI):
         self._backend = backend
         self._user = user
+        self._resolver = resolve.Filter(self._backend.schema)
+        self._validate = validate.Filter(self._backend.schema)
+        self._ac = ac.NullAC(self._backend, self._user)
         # ensure Graph schema requirements
         self.migrate(self._backend.schema)
 
@@ -85,6 +90,9 @@ class Graph():
         # migrate schema in backend
         # FIXME: consult access controls!
         self._backend.schema = schema
+        # re-initialize members
+        self._resolver.schema = self.schema
+        self._validate.schema = self.schema
         # return self
         return self
 
@@ -108,11 +116,21 @@ class Graph():
         *node_type*) once some data is assigned to them.
 
         """
-        type_ = self.schema.node(node_type)
-        return _nodes.Nodes(self._backend, self._user, type_, {guid})
+        return self.nodes(node_type, {guid})
 
-    def get(self, node_type: URI, subject: ast.filter.FilterExpression) -> _nodes.Nodes:
+    def get(self, node_type: URI, query: ast.filter.FilterExpression) -> _nodes.Nodes: # FIXME: How about empty query?
         """Return a `Nodes` instance over all nodes of type *node_type* that match the *subject* query."""
-        raise NotImplementedError()
+        # get node type
+        type_ = self.schema.node(node_type)
+        # resolve Nodes instances
+        query = self._resolver(type_, query)
+        # add access controls to query
+        query = self._ac.filter_read(type_, query)
+        # validate query
+        self._validate(type_, query)
+        # query the backend
+        guids = self._backend.get(type_, query) # no need to materialize
+        # return Nodes instance
+        return _nodes.Nodes(self._backend, self._user, type_, guids)
 
 ## EOF ##
diff --git a/bsfs/graph/resolve.py b/bsfs/graph/resolve.py
new file mode 100644
index 0000000..feb0855
--- /dev/null
+++ b/bsfs/graph/resolve.py
@@ -0,0 +1,161 @@
+"""
+
+Part of the BlackStar filesystem (bsfs) module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsfs imports
+from bsfs import schema as bsc
+from bsfs.query import ast
+from bsfs.utils import errors
+
+# inner-module imports
+from . import nodes
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Filter',
+    )
+
+
+## code ##
+
+class Filter():
+    """Rewrites the query to replace `bsfs.graph.nodes.Nodes` instances with the respective URI.
+    Performs only limited type checking and schema validation.
+    Use `bsfs.schema.validate.Filter` for full validation.
+
+    Example:
+        input:  Any(ns.bse.tag, Is(Nodes(...)))
+        output: Any(ns.bse.tag, Or(Is(...), Is(...), ...))
+
+    >>> tags = graph.node(ns.bsfs.Tag, 'http://example.com/me/tag#1234')
+    >>> graph.get(ns.bsfs.Entity, ast.filter.Any(ns.bse.tag, ast.filter.Is(tags)))
+
+    """
+
+    T_VERTEX = typing.Union[bsc.Node, bsc.Literal]
+
+    def __init__(self, schema):
+        self.schema = schema
+
+    def __call__(self, root_type: bsc.Node, node: ast.filter.FilterExpression):
+        return self._parse_filter_expression(root_type, node)
+
+    def _parse_filter_expression(
+            self,
+            type_: T_VERTEX,
+            node: ast.filter.FilterExpression,
+            ) -> ast.filter.FilterExpression:
+        """Route *node* to the handler of the respective FilterExpression subclass."""
+        if isinstance(node, ast.filter.Is):
+            return self._is(type_, node)
+        if isinstance(node, ast.filter.Not):
+            return self._not(type_, node)
+        if isinstance(node, ast.filter.Has):
+            return self._has(type_, node)
+        if isinstance(node, ast.filter.Any):
+            return self._any(type_, node)
+        if isinstance(node, ast.filter.All):
+            return self._all(type_, node)
+        if isinstance(node, ast.filter.And):
+            return self._and(type_, node)
+        if isinstance(node, ast.filter.Or):
+            return self._or(type_, node)
+        if isinstance(node, (ast.filter.Equals, ast.filter.Substring, \
+                             ast.filter.StartsWith, ast.filter.EndsWith)):
+            return self._value(type_, node)
+        if isinstance(node, (ast.filter.LessThan, ast.filter.GreaterThan)):
+            return self._bounded(type_, node)
+        # invalid node
+        raise errors.BackendError(f'expected filter expression, found {node}')
+
+    def _parse_predicate_expression(self, node: ast.filter.PredicateExpression) -> T_VERTEX:
+        """Route *node* to the handler of the respective PredicateExpression subclass."""
+        if isinstance(node, ast.filter.Predicate):
+            return self._predicate(node)
+        if isinstance(node, ast.filter.OneOf):
+            return self._one_of(node)
+        # invalid node
+        raise errors.BackendError(f'expected predicate expression, found {node}')
+
+    def _predicate(self, node: ast.filter.Predicate) -> T_VERTEX:
+        if not self.schema.has_predicate(node.predicate):
+            raise errors.ConsistencyError(f'predicate {node.predicate} is not in the schema')
+        pred = self.schema.predicate(node.predicate)
+        dom, rng = pred.domain, pred.range
+        if node.reverse:
+            dom, rng = rng, dom
+        return rng
+
+    def _one_of(self, node: ast.filter.OneOf) -> T_VERTEX:
+        # determine the most generic range type
+        rng = None
+        for pred in node:
+            # parse child expression
+            subrng = self._parse_predicate_expression(pred)
+            # determine the next type
+            try:
+                if rng is None or subrng > rng: # pick most generic range
+                    rng = subrng
+            except TypeError as err:
+                raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related') from err
+        if rng is None:
+            raise errors.UnreachableError()
+        return rng
+
+    def _any(self, type_: T_VERTEX, node: ast.filter.Any) -> ast.filter.Any: # pylint: disable=unused-argument
+        next_type = self._parse_predicate_expression(node.predicate)
+        return ast.filter.Any(node.predicate, self._parse_filter_expression(next_type, node.expr))
+
+    def _all(self, type_: T_VERTEX, node: ast.filter.All) -> ast.filter.All: # pylint: disable=unused-argument
+        next_type = self._parse_predicate_expression(node.predicate)
+        return ast.filter.All(node.predicate, self._parse_filter_expression(next_type, node.expr))
+
+    def _and(self, type_: T_VERTEX, node: ast.filter.And) -> ast.filter.And:
+        return ast.filter.And({self._parse_filter_expression(type_, expr) for expr in node})
+
+    def _or(self, type_: T_VERTEX, node: ast.filter.Or) -> ast.filter.Or:
+        return ast.filter.Or({self._parse_filter_expression(type_, expr) for expr in node})
+
+    def _not(self, type_: T_VERTEX, node: ast.filter.Not) -> ast.filter.Not:
+        return ast.filter.Not(self._parse_filter_expression(type_, node.expr))
+
+    def _has(self, type_: T_VERTEX, node: ast.filter.Has) -> ast.filter.Has: # pylint: disable=unused-argument
+        return node
+
+    def _value(self, type_: T_VERTEX, node: ast.filter._Value) -> ast.filter._Value: # pylint: disable=unused-argument
+        return node
+
+    def _bounded(self, type_: T_VERTEX, node: ast.filter._Bounded) -> ast.filter._Bounded: # pylint: disable=unused-argument
+        return node
+
+    def _is(self, type_: T_VERTEX, node: ast.filter.Is) -> typing.Union[ast.filter.Or, ast.filter.Is]:
+        # check if action is needed
+        if not isinstance(node.value, nodes.Nodes):
+            return node
+        # check schema consistency
+        if node.value.node_type not in self.schema.nodes():
+            raise errors.ConsistencyError(f'node {node.value.node_type} is not in the schema')
+        # check type compatibility
+        if not isinstance(type_, bsc.Node):
+            raise errors.ConsistencyError(f'expected a node, found {type_}')
+        if not node.value.node_type <= type_:
+            raise errors.ConsistencyError(f'expected type {type_} or subtype thereof, found {node.value.node_type}')
+        # NOTE: We assume that the node type is checked when writing to the backend.
+        # Links to any of the guids can therefore only exist if the type matches.
+        # Hence, we don't add a type check/constraint here.
+        return ast.filter.Or(ast.filter.Is(guid) for guid in node.value.guids)
+        # optimized code, removing unnecessary ast.filter.Or
+        #guids = set(node.value.guids)
+        #if len(guids) == 0:
+        #    raise errors.BackendError(f'')
+        #if len(guids) == 1:
+        #    return ast.filter.Nodeid(next(iter(guids)))
+        #return ast.filter.Or(ast.filter.Is(guid) for guid in guids)
+
+
+## EOF ##
-- 
cgit v1.2.3