From 791918039979d0743fd2ea4b9a5e74593ff96fd0 Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Mon, 19 Dec 2022 13:32:34 +0100 Subject: query ast file structures and essential interfaces --- bsfs/graph/graph.py | 5 + bsfs/query/__init__.py | 20 +++ bsfs/query/ast/__init__.py | 24 ++++ bsfs/query/ast/filter_.py | 30 ++++ bsfs/query/validator.py | 35 +++++ bsfs/triple_store/base.py | 8 ++ bsfs/triple_store/sparql.py | 253 ---------------------------------- bsfs/triple_store/sparql/__init__.py | 18 +++ bsfs/triple_store/sparql/sparql.py | 256 +++++++++++++++++++++++++++++++++++ 9 files changed, 396 insertions(+), 253 deletions(-) create mode 100644 bsfs/query/__init__.py create mode 100644 bsfs/query/ast/__init__.py create mode 100644 bsfs/query/ast/filter_.py create mode 100644 bsfs/query/validator.py delete mode 100644 bsfs/triple_store/sparql.py create mode 100644 bsfs/triple_store/sparql/__init__.py create mode 100644 bsfs/triple_store/sparql/sparql.py (limited to 'bsfs') diff --git a/bsfs/graph/graph.py b/bsfs/graph/graph.py index b7b9f1c..10e5904 100644 --- a/bsfs/graph/graph.py +++ b/bsfs/graph/graph.py @@ -9,6 +9,7 @@ import os import typing # bsfs imports +from bsfs.query import ast from bsfs.schema import Schema from bsfs.triple_store import TripleStoreBase from bsfs.utils import URI, typename @@ -110,4 +111,8 @@ class Graph(): type_ = self.schema.node(node_type) return _nodes.Nodes(self._backend, self._user, type_, {guid}) + def get(self, node_type: URI, subject: ast.filter.FilterExpression) -> Nodes: + """Return a `Nodes` instance over all nodes of type *node_type* that match the *subject* query.""" + raise NotImplementedError() + ## EOF ## diff --git a/bsfs/query/__init__.py b/bsfs/query/__init__.py new file mode 100644 index 0000000..21c7389 --- /dev/null +++ b/bsfs/query/__init__.py @@ -0,0 +1,20 @@ +""" + +Part of the BlackStar filesystem (bsfs) module. +A copy of the license is provided with the project. 
+Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# inner-module imports +from . import ast +from . import validator as validate + +# exports +__all__: typing.Sequence[str] = ( + 'ast', + 'validate', + ) + +## EOF ## diff --git a/bsfs/query/ast/__init__.py b/bsfs/query/ast/__init__.py new file mode 100644 index 0000000..0ee7385 --- /dev/null +++ b/bsfs/query/ast/__init__.py @@ -0,0 +1,24 @@ +"""Query AST components. + +The query AST consists of a Filter syntax tree. + +Classes beginning with an underscore (_) represent internal type hierarchies +and should not be used for parsing. Note that the AST structures do not +(and cannot) check semantic validity or consistency with a given schema. + +Part of the BlackStar filesystem (bsfs) module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# inner-module imports +from . import filter_ as filter + +# exports +__all__: typing.Sequence[str] = ( + 'filter', + ) + +## EOF ## diff --git a/bsfs/query/ast/filter_.py b/bsfs/query/ast/filter_.py new file mode 100644 index 0000000..4086fc1 --- /dev/null +++ b/bsfs/query/ast/filter_.py @@ -0,0 +1,30 @@ +"""Filter AST. + +Part of the BlackStar filesystem (bsfs) module. +A copy of the license is provided with the project. 
+Author: Matthias Baumgartner, 2022 +""" +# imports +from collections import abc +import typing + +# exports +__all__ : typing.Sequence[str] = [] + + +## code ## + +class _Expression(abc.Hashable): + def __repr__(self) -> str: + """Return the expressions's string representation.""" + return f'{typename(self)}()' + + def __hash__(self) -> int: + """Return the expression's integer representation.""" + return hash(type(self)) + + def __eq__(self, other: typing.Any) -> bool: + """Return True if *self* and *other* are equivalent.""" + return isinstance(other, type(self)) + +## EOF ## diff --git a/bsfs/query/validator.py b/bsfs/query/validator.py new file mode 100644 index 0000000..ac3789a --- /dev/null +++ b/bsfs/query/validator.py @@ -0,0 +1,35 @@ +""" + +Part of the BlackStar filesystem (bsfs) module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# bsfs imports +from bsfs import schema as bsc + +# inner-module imports +from . import ast + +# exports +__all__ : typing.Sequence[str] = ( + 'Filter', + ) + + +## code ## + +class Filter(): + + # schema to validate against. 
+ schema: bsc.Schema + + def __init__(self, schema: bsc.Schema): + self.schema = schema + + def parse(self, node: ast.filter.FilterExpression): + raise NotImplementedError() + +## EOF ## diff --git a/bsfs/triple_store/base.py b/bsfs/triple_store/base.py index 6561262..28ebb86 100644 --- a/bsfs/triple_store/base.py +++ b/bsfs/triple_store/base.py @@ -108,6 +108,14 @@ class TripleStoreBase(abc.ABC): """ + @abc.abstractmethod + def get( + self, + node_type: bsc.Node, + query: ast.filter.FilterExpression, + ) -> typing.Iterator[URI]: + """Return guids of nodes of type *node_type* that match the *query*.""" + @abc.abstractmethod def exists( self, diff --git a/bsfs/triple_store/sparql.py b/bsfs/triple_store/sparql.py deleted file mode 100644 index 7516dff..0000000 --- a/bsfs/triple_store/sparql.py +++ /dev/null @@ -1,253 +0,0 @@ -""" - -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports -import itertools -import typing -import rdflib - -# bsfs imports -from bsfs import schema as bsc -from bsfs.utils import errors, URI - -# inner-module imports -from . import base - - -# exports -__all__: typing.Sequence[str] = ( - 'SparqlStore', - ) - - -## code ## - -class _Transaction(): - """Lightweight rdflib transactions for in-memory databases.""" - - # graph instance. - _graph: rdflib.Graph - - # current log of added triples. - _added: typing.List[typing.Any] - - # current log of removed triples. 
- _removed: typing.List[typing.Any] - - def __init__(self, graph: rdflib.Graph): - self._graph = graph - # initialize internal structures - self.commit() - - def commit(self): - """Commit temporary changes.""" - self._added = [] - self._removed = [] - - def rollback(self): - """Undo changes since the last commit.""" - for triple in self._added: - self._graph.remove(triple) - for triple in self._removed: - self._graph.add(triple) - - def add(self, triple: typing.Any): - """Add a triple to the graph.""" - if triple not in self._graph: - self._added.append(triple) - self._graph.add(triple) - - def remove(self, triple: typing.Any): - """Remove a triple from the graph.""" - if triple in self._graph: - self._removed.append(triple) - self._graph.remove(triple) - - -class SparqlStore(base.TripleStoreBase): - """Sparql-based triple store. - - The sparql triple store uses a third-party backend - (currently rdflib) to store triples and manages them via - the Sparql query language. - - """ - - # The rdflib graph. - _graph: rdflib.Graph - - # Current transaction. - _transaction: _Transaction - - # The local schema. - _schema: bsc.Schema - - def __init__(self): - super().__init__(None) - self._graph = rdflib.Graph() - self._transaction = _Transaction(self._graph) - self._schema = bsc.Schema.Empty() - - # NOTE: mypy and pylint complain about the **kwargs not being listed (contrasting super) - # However, not having it here is clearer since it's explicit that there are no arguments. 
- @classmethod - def Open(cls) -> 'SparqlStore': # type: ignore [override] # pylint: disable=arguments-differ - return cls() - - def commit(self): - self._transaction.commit() - - def rollback(self): - self._transaction.rollback() - - @property - def schema(self) -> bsc.Schema: - return self._schema - - @schema.setter - def schema(self, schema: bsc.Schema): - # check args: Schema instanace - if not isinstance(schema, bsc.Schema): - raise TypeError(schema) - # check compatibility: No contradicting definitions - if not self.schema.consistent_with(schema): - raise errors.ConsistencyError(f'{schema} is inconsistent with {self.schema}') - - # commit the current transaction - self.commit() - - # adjust instances: - # nothing to do for added classes - # delete instances of removed classes - - # get deleted classes - sub = self.schema - schema - - # remove predicate instances - for pred in sub.predicates: - for src, trg in self._graph.subject_objects(rdflib.URIRef(pred.uri)): - self._transaction.remove((src, rdflib.URIRef(pred.uri), trg)) - - # remove node instances - for node in sub.nodes: - # iterate through node instances - for inst in self._graph.subjects(rdflib.RDF.type, rdflib.URIRef(node.uri)): - # remove triples where the instance is in the object position - for src, pred in self._graph.subject_predicates(inst): - self._transaction.remove((src, pred, inst)) - # remove triples where the instance is in the subject position - for pred, trg in self._graph.predicate_objects(inst): - self._transaction.remove((inst, pred, trg)) - # remove instance - self._transaction.remove((inst, rdflib.RDF.type, rdflib.URIRef(node.uri))) - - # NOTE: Nothing to do for literals - - # commit instance changes - self.commit() - - # migrate schema - self._schema = schema - - - def _has_type(self, subject: URI, node_type: bsc.Node) -> bool: - """Return True if *subject* is a node of class *node_type* or a subclass thereof.""" - if node_type not in self.schema.nodes(): - raise 
errors.ConsistencyError(f'{node_type} is not defined in the schema') - - subject_types = list(self._graph.objects(rdflib.URIRef(subject), rdflib.RDF.type)) - if len(subject_types) == 0: - return False - if len(subject_types) == 1: - node = self.schema.node(URI(subject_types[0])) # type: ignore [arg-type] # URI is a subtype of str - if node == node_type: - return True - if node_type in node.parents(): - return True - return False - raise errors.UnreachableError() - - def exists( - self, - node_type: bsc.Node, - guids: typing.Iterable[URI], - ) -> typing.Iterable[URI]: - return (subj for subj in guids if self._has_type(subj, node_type)) - - def create( - self, - node_type: bsc.Node, - guids: typing.Iterable[URI], - ): - # check node_type - if node_type not in self.schema.nodes(): - raise errors.ConsistencyError(f'{node_type} is not defined in the schema') - # check and create guids - for guid in guids: - subject = rdflib.URIRef(guid) - # check node existence - if (subject, rdflib.RDF.type, None) in self._graph: - # FIXME: node exists and may have a different type! ignore? raise? report? - continue - # add node - self._transaction.add((subject, rdflib.RDF.type, rdflib.URIRef(node_type.uri))) - - def set( - self, - node_type: bsc.Node, - guids: typing.Iterable[URI], - predicate: bsc.Predicate, - values: typing.Iterable[typing.Any], - ): - # check node_type - if node_type not in self.schema.nodes(): - raise errors.ConsistencyError(f'{node_type} is not defined in the schema') - # check predicate - if predicate not in self.schema.predicates(): - raise errors.ConsistencyError(f'{predicate} is not defined in the schema') - if not node_type <= predicate.domain: - raise errors.ConsistencyError(f'{node_type} must be a subclass of {predicate.domain}') - # NOTE: predicate.range is in the schema since predicate is in the schema. 
- # materialize values - values = set(values) - # check values - if len(values) == 0: - return - if predicate.unique and len(values) != 1: - raise ValueError(values) - if isinstance(predicate.range, bsc.Node): - values = set(values) # materialize to safeguard against iterators passed as argument - inconsistent = {val for val in values if not self._has_type(val, predicate.range)} - # catches nodes that don't exist and nodes that have an inconsistent type - if len(inconsistent) > 0: - raise errors.InstanceError(inconsistent) - # check guids - # FIXME: Fail or skip inexistent nodes? - guids = set(guids) - inconsistent = {guid for guid in guids if not self._has_type(guid, node_type)} - if len(inconsistent) > 0: - raise errors.InstanceError(inconsistent) - - # add triples - pred = rdflib.URIRef(predicate.uri) - for guid, value in itertools.product(guids, values): - guid = rdflib.URIRef(guid) - # convert value - if isinstance(predicate.range, bsc.Literal): - value = rdflib.Literal(value, datatype=rdflib.URIRef(predicate.range.uri)) - elif isinstance(predicate.range, bsc.Node): - value = rdflib.URIRef(value) - else: - raise errors.UnreachableError() - # clear triples for unique predicates - if predicate.unique: - for obj in self._graph.objects(guid, pred): - if obj != value: - self._transaction.remove((guid, pred, obj)) - # add triple - self._transaction.add((guid, pred, value)) - -## EOF ## diff --git a/bsfs/triple_store/sparql/__init__.py b/bsfs/triple_store/sparql/__init__.py new file mode 100644 index 0000000..285334a --- /dev/null +++ b/bsfs/triple_store/sparql/__init__.py @@ -0,0 +1,18 @@ +""" + +Part of the BlackStar filesystem (bsfs) module. +A copy of the license is provided with the project. 
+Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# inner-module imports +from .sparql import SparqlStore + +# exports +__all__: typing.Sequence[str] = ( + 'SparqlStore', + ) + +## EOF ## diff --git a/bsfs/triple_store/sparql/sparql.py b/bsfs/triple_store/sparql/sparql.py new file mode 100644 index 0000000..fff540a --- /dev/null +++ b/bsfs/triple_store/sparql/sparql.py @@ -0,0 +1,256 @@ +""" + +Part of the BlackStar filesystem (bsfs) module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import itertools +import typing +import rdflib + +# bsfs imports +from bsfs import schema as bsc +from bsfs.query import ast +from bsfs.utils import errors, URI + +# inner-module imports +from . import base + + +# exports +__all__: typing.Sequence[str] = ( + 'SparqlStore', + ) + + +## code ## + +class _Transaction(): + """Lightweight rdflib transactions for in-memory databases.""" + + # graph instance. + _graph: rdflib.Graph + + # current log of added triples. + _added: typing.List[typing.Any] + + # current log of removed triples. + _removed: typing.List[typing.Any] + + def __init__(self, graph: rdflib.Graph): + self._graph = graph + # initialize internal structures + self.commit() + + def commit(self): + """Commit temporary changes.""" + self._added = [] + self._removed = [] + + def rollback(self): + """Undo changes since the last commit.""" + for triple in self._added: + self._graph.remove(triple) + for triple in self._removed: + self._graph.add(triple) + + def add(self, triple: typing.Any): + """Add a triple to the graph.""" + if triple not in self._graph: + self._added.append(triple) + self._graph.add(triple) + + def remove(self, triple: typing.Any): + """Remove a triple from the graph.""" + if triple in self._graph: + self._removed.append(triple) + self._graph.remove(triple) + + +class SparqlStore(base.TripleStoreBase): + """Sparql-based triple store. 
+ + The sparql triple store uses a third-party backend + (currently rdflib) to store triples and manages them via + the Sparql query language. + + """ + + # The rdflib graph. + _graph: rdflib.Graph + + # Current transaction. + _transaction: _Transaction + + # The local schema. + _schema: bsc.Schema + + def __init__(self): + super().__init__(None) + self._graph = rdflib.Graph() + self._transaction = _Transaction(self._graph) + self._schema = bsc.Schema.Empty() + + # NOTE: mypy and pylint complain about the **kwargs not being listed (contrasting super) + # However, not having it here is clearer since it's explicit that there are no arguments. + @classmethod + def Open(cls) -> 'SparqlStore': # type: ignore [override] # pylint: disable=arguments-differ + return cls() + + def commit(self): + self._transaction.commit() + + def rollback(self): + self._transaction.rollback() + + @property + def schema(self) -> bsc.Schema: + return self._schema + + @schema.setter + def schema(self, schema: bsc.Schema): + # check args: Schema instanace + if not isinstance(schema, bsc.Schema): + raise TypeError(schema) + # check compatibility: No contradicting definitions + if not self.schema.consistent_with(schema): + raise errors.ConsistencyError(f'{schema} is inconsistent with {self.schema}') + + # commit the current transaction + self.commit() + + # adjust instances: + # nothing to do for added classes + # delete instances of removed classes + + # get deleted classes + sub = self.schema - schema + + # remove predicate instances + for pred in sub.predicates: + for src, trg in self._graph.subject_objects(rdflib.URIRef(pred.uri)): + self._transaction.remove((src, rdflib.URIRef(pred.uri), trg)) + + # remove node instances + for node in sub.nodes: + # iterate through node instances + for inst in self._graph.subjects(rdflib.RDF.type, rdflib.URIRef(node.uri)): + # remove triples where the instance is in the object position + for src, pred in self._graph.subject_predicates(inst): + 
self._transaction.remove((src, pred, inst)) + # remove triples where the instance is in the subject position + for pred, trg in self._graph.predicate_objects(inst): + self._transaction.remove((inst, pred, trg)) + # remove instance + self._transaction.remove((inst, rdflib.RDF.type, rdflib.URIRef(node.uri))) + + # NOTE: Nothing to do for literals + + # commit instance changes + self.commit() + + # migrate schema + self._schema = schema + + def get(self, node_type: bsc.Node, query: ast.filter.FilterExpression) -> typing.Iterator[URI]: + raise NotImplementedError() + + def _has_type(self, subject: URI, node_type: bsc.Node) -> bool: + """Return True if *subject* is a node of class *node_type* or a subclass thereof.""" + if node_type not in self.schema.nodes(): + raise errors.ConsistencyError(f'{node_type} is not defined in the schema') + + subject_types = list(self._graph.objects(rdflib.URIRef(subject), rdflib.RDF.type)) + if len(subject_types) == 0: + return False + if len(subject_types) == 1: + node = self.schema.node(URI(subject_types[0])) # type: ignore [arg-type] # URI is a subtype of str + if node == node_type: + return True + if node_type in node.parents(): + return True + return False + raise errors.UnreachableError() + + def exists( + self, + node_type: bsc.Node, + guids: typing.Iterable[URI], + ) -> typing.Iterable[URI]: + return (subj for subj in guids if self._has_type(subj, node_type)) + + def create( + self, + node_type: bsc.Node, + guids: typing.Iterable[URI], + ): + # check node_type + if node_type not in self.schema.nodes(): + raise errors.ConsistencyError(f'{node_type} is not defined in the schema') + # check and create guids + for guid in guids: + subject = rdflib.URIRef(guid) + # check node existence + if (subject, rdflib.RDF.type, None) in self._graph: + # FIXME: node exists and may have a different type! ignore? raise? report? 
+ continue + # add node + self._transaction.add((subject, rdflib.RDF.type, rdflib.URIRef(node_type.uri))) + + def set( + self, + node_type: bsc.Node, + guids: typing.Iterable[URI], + predicate: bsc.Predicate, + values: typing.Iterable[typing.Any], + ): + # check node_type + if node_type not in self.schema.nodes(): + raise errors.ConsistencyError(f'{node_type} is not defined in the schema') + # check predicate + if predicate not in self.schema.predicates(): + raise errors.ConsistencyError(f'{predicate} is not defined in the schema') + if not node_type <= predicate.domain: + raise errors.ConsistencyError(f'{node_type} must be a subclass of {predicate.domain}') + # NOTE: predicate.range is in the schema since predicate is in the schema. + # materialize values + values = set(values) + # check values + if len(values) == 0: + return + if predicate.unique and len(values) != 1: + raise ValueError(values) + if isinstance(predicate.range, bsc.Node): + values = set(values) # materialize to safeguard against iterators passed as argument + inconsistent = {val for val in values if not self._has_type(val, predicate.range)} + # catches nodes that don't exist and nodes that have an inconsistent type + if len(inconsistent) > 0: + raise errors.InstanceError(inconsistent) + # check guids + # FIXME: Fail or skip inexistent nodes? 
+ guids = set(guids) + inconsistent = {guid for guid in guids if not self._has_type(guid, node_type)} + if len(inconsistent) > 0: + raise errors.InstanceError(inconsistent) + + # add triples + pred = rdflib.URIRef(predicate.uri) + for guid, value in itertools.product(guids, values): + guid = rdflib.URIRef(guid) + # convert value + if isinstance(predicate.range, bsc.Literal): + value = rdflib.Literal(value, datatype=rdflib.URIRef(predicate.range.uri)) + elif isinstance(predicate.range, bsc.Node): + value = rdflib.URIRef(value) + else: + raise errors.UnreachableError() + # clear triples for unique predicates + if predicate.unique: + for obj in self._graph.objects(guid, pred): + if obj != value: + self._transaction.remove((guid, pred, obj)) + # add triple + self._transaction.add((guid, pred, value)) + +## EOF ## -- cgit v1.2.3 From a0f2308adcb226d28de3355bc7115a6d9b669462 Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Mon, 19 Dec 2022 13:40:02 +0100 Subject: import fixes --- bsfs/graph/graph.py | 2 +- bsfs/query/validator.py | 177 ++++++++++++++++++++++++++++++++++++- bsfs/triple_store/base.py | 3 +- bsfs/triple_store/sparql/sparql.py | 2 +- 4 files changed, 179 insertions(+), 5 deletions(-) (limited to 'bsfs') diff --git a/bsfs/graph/graph.py b/bsfs/graph/graph.py index 10e5904..51fe75d 100644 --- a/bsfs/graph/graph.py +++ b/bsfs/graph/graph.py @@ -111,7 +111,7 @@ class Graph(): type_ = self.schema.node(node_type) return _nodes.Nodes(self._backend, self._user, type_, {guid}) - def get(self, node_type: URI, subject: ast.filter.FilterExpression) -> Nodes: + def get(self, node_type: URI, subject: ast.filter.FilterExpression) -> _nodes.Nodes: """Return a `Nodes` instance over all nodes of type *node_type* that match the *subject* query.""" raise NotImplementedError() diff --git a/bsfs/query/validator.py b/bsfs/query/validator.py index ac3789a..123b947 100644 --- a/bsfs/query/validator.py +++ b/bsfs/query/validator.py @@ -29,7 +29,180 @@ class Filter(): def 
__init__(self, schema: bsc.Schema): self.schema = schema - def parse(self, node: ast.filter.FilterExpression): - raise NotImplementedError() + def parse(self, node: ast.filter.FilterExpression, subject: bsc.types._Vertex): + # subject is a node type + if not isinstance(subject, bsc.Node): + raise errors.ConsistencyError(f'Expected a node, found {subject}') + # subject exists in the schema + if subject not in self.schema.nodes: + raise errors.ConsistencyError(f'Invalid node type {subject}') + # root expression is valid + self._parse(node, subject) + # all tests passed + return True + + + def _parse_numerical_expression(self, node: ast.filter.FilterExpression, subject: bsc.types._Vertex): + if isinstance(node, ast.filter.And): + return self._and(node, subject) + elif isinstance(node, ast.filter.Or): + return self._or(node, subject) + elif isinstance(node, ast.filter.LessThan): + return self._lessThan(node, subject) + elif isinstance(node, ast.filter.GreaterThan): + return self._greaterThan(node, subject) + elif isinstance(node, ast.filter.Equals): + return self._equals(node, subject, numerical=True) + else: + raise errors.ConsistencyError(f'Expected a numerical expression, found {node}') + + + def __branch(self, node: typing.Union[ast.filter.Any, ast.filter.And], subject: bsc.types._Vertex): + # subject is a node type + if not isinstance(subject, bsc.Node): + raise errors.ConsistencyError(f'Expected a node, found {subject}') + # subject exists in the schema + if subject not in self.schema.nodes: + raise errors.ConsistencyError(f'Invalid node type {subject}') + # predicate is valid + dom, rng = self._parse_predicate_expression(node.predicate) + # subject is a subtype of the predicate's domain + if not subject <= dom: + raise errors.ConsistencyError(f'Expected type {dom}, found {subject}') + # child expression is valid + self._parse_filter_expression(node.expr, rng) + + def _any(self, node: ast.filter.Any, subject: bsc.types._Vertex): + return self.__branch(node, 
subject) + + def _all(self, node: ast.filter.All, subject: bsc.types._Vertex): + return self.__branch(node, subject) + + + def __agg(self, node: typing.Union[ast.filter.And, ast.filter.Or], subject: bsc.types._Vertex): + for expr in node: + # child expression is valid + self._parse_filter_expression(expr, subject) + + def _and(self, node: ast.filter.And, subject: bsc.types._Vertex): + return self.__agg(node, subject) + + def _or(self, node: ast.filter.Or, subject: bsc.types._Vertex): + return self.__agg(node, subject) + + + def _not(self, node: ast.filter.Not, subject: bsc.types._Vertex): + # child expression is valid + self._parse_filter_expression(node.expr, subject) + + + def _has(self, node: ast.filter.Has, subject: bsc.types._Vertex): + # subject is a node type + if not isinstance(subject, bsc.Node): + raise errors.ConsistencyError(f'Expected a node, found {subject}') + # subject exists in the schema + if subject not in self.schema.nodes: + raise errors.ConsistencyError(f'Invalid node type {subject}') + # predicate is valid + dom, rng = self._parse_predicate_expression(node.predicate) + # subject is a subtype of the predicate's domain + if not subject <= dom: + raise errors.ConsistencyError(f'Expected type {dom}, found {subject}') + # node.count is a numerical expression + self._parse_numerical_expression(node.count, self.schema.literal(ns.xsd.numerical)) + + + def _equals(self, node: ast.filter.Equals, subject: bsc.types._Vertex, numerical: bool = False): + # subject is a literal + #if not isinstance(subject, bsc.Literal): + # raise errors.ConsistencyError(f'Expected a literal, found {subject}') + if isinstance(subject, bsc.Node): + # FIXME: How to handle this case? + # FIXME: How to check if a NodeType is acceptable? + # FIXME: Maybe use flags to control what is expected as node identifiers? 
+ from bsfs.graph.nodes import Nodes # FIXME + if not isinstance(node.value, Nodes) and not isinstance(node.value, URI): + raise errors.ConsistencyError(f'Expected a Nodes or URI, found {node.value}') + elif isinstance(subject, bsc.Literal): + # literal exists in the schema + if subject not in self.schema.literals: + raise errors.ConsistencyError(f'Invalid literal type {subject}') + else: + # FIXME: + raise errors.ConsistencyError(f'Expected a literal, found {subject}') + # node.value is numeric (if requested) + if numerical and not isinstance(node.value, float) and not isinstance(node.value, int): + raise errors.ConsistencyError(f'Expected a numerical value (int or float), found {node.value}') + # NOTE: We cannot check if node.value agrees with the subject since we don't know + # all literal types, their hierarchy, and how the backend converts datatypes. + + + def _substring(self, node: ast.filter.Substring, subject: bsc.types._Vertex): + # subject is a literal + if not isinstance(subject, bsc.Literal): + raise errors.ConsistencyError(f'Expected a literal, found {subject}') + # literal exists in the schema + if subject not in self.schema.literals: + raise errors.ConsistencyError(f'Invalid literal type {subject}') + # node.value matches literal datatype + if not subject.is_a(ns.xsd.string): + raise errors.ConsistencyError(f'Expected a string literal, found {subject}') + + + def _lessThan(self, node: ast.filter.LessThan, subject: bsc.types._Vertex): + # subject is a literal + if not isinstance(subject, bsc.Literal): + raise errors.ConsistencyError(f'Expected a literal, found {subject}') + # literal exists in the schema + if subject not in self.schema.literals: + raise errors.ConsistencyError(f'Invalid literal type {subject}') + # subject is numerical + if not subject.is_a(ns.xsd.numerical): + raise errors.ConsistencyError(f'Expected a numerical literal, found {subject}') + + + def _greaterThan(self, node: ast.filter.GreaterThan, subject: bsc.types._Vertex): + # 
subject is a literal + if not isinstance(subject, bsc.Literal): + raise errors.ConsistencyError(f'Expected a literal, found {subject}') + # literal exists in the schema + if subject not in self.schema.literals: + raise errors.ConsistencyError(f'Invalid literal type {subject}') + # subject is numerical + if not subject.is_a(ns.xsd.numerical): + raise errors.ConsistencyError(f'Expected a numerical literal, found {subject}') + + + def _predicate(self, node: ast.filter.Predicate): + try: + # predicate exists in the schema + pred = self.schema.predicate(node.predicate) + except KeyError: + raise errors.ConsistencyError(f'') # FIXME + if node.reverse: + return pred.range, pred.domain + else: + return pred.domain, pred.range + + + def _oneOf(self, node: ast.filter.OneOf): + dom, rng = None, None + for pred in node: + try: + # parse child expression + subdom, subrng = self._parse_predicate_expression(pred) + # domain and range must be related across all child expressions + if not subdom <= dom and not subdom >= dom: + raise errors.ConsistencyError(f'') # FIXME + if not subrng <= rng and not subrng >= rng: + raise errors.ConsistencyError(f'') # FIXME + # determine overall domain and range + if dom is None or subdom < dom: # pick most specific domain + dom = subdom + if rng is None or subrng > rng: # pick most generic range + rng = subrng + except KeyError: + raise errors.ConsistencyError(f'') + return dom, rng ## EOF ## diff --git a/bsfs/triple_store/base.py b/bsfs/triple_store/base.py index 28ebb86..5ff9523 100644 --- a/bsfs/triple_store/base.py +++ b/bsfs/triple_store/base.py @@ -9,6 +9,7 @@ import abc import typing # inner-module imports +from bsfs.query import ast from bsfs.utils import URI, typename import bsfs.schema as _schema @@ -111,7 +112,7 @@ class TripleStoreBase(abc.ABC): @abc.abstractmethod def get( self, - node_type: bsc.Node, + node_type: _schema.Node, query: ast.filter.FilterExpression, ) -> typing.Iterator[URI]: """Return guids of nodes of type *node_type* 
that match the *query*.""" diff --git a/bsfs/triple_store/sparql/sparql.py b/bsfs/triple_store/sparql/sparql.py index fff540a..7172f34 100644 --- a/bsfs/triple_store/sparql/sparql.py +++ b/bsfs/triple_store/sparql/sparql.py @@ -15,7 +15,7 @@ from bsfs.query import ast from bsfs.utils import errors, URI # inner-module imports -from . import base +from .. import base # exports -- cgit v1.2.3 From 383fa8fd5c2e4b67089b4c5b654ebade51382f2c Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Thu, 22 Dec 2022 20:27:49 +0100 Subject: filter ast definition and validation --- bsfs/query/ast/__init__.py | 2 +- bsfs/query/ast/filter_.py | 405 ++++++++++++++++++++++++++++++++++++++++++++- bsfs/query/validator.py | 336 +++++++++++++++++++------------------ bsfs/utils/__init__.py | 3 +- bsfs/utils/commons.py | 34 ++++ bsfs/utils/errors.py | 3 + 6 files changed, 620 insertions(+), 163 deletions(-) (limited to 'bsfs') diff --git a/bsfs/query/ast/__init__.py b/bsfs/query/ast/__init__.py index 0ee7385..704d051 100644 --- a/bsfs/query/ast/__init__.py +++ b/bsfs/query/ast/__init__.py @@ -14,7 +14,7 @@ Author: Matthias Baumgartner, 2022 import typing # inner-module imports -from . import filter_ as filter +from . import filter_ as filter # pylint: disable=redefined-builtin # exports __all__: typing.Sequence[str] = ( diff --git a/bsfs/query/ast/filter_.py b/bsfs/query/ast/filter_.py index 4086fc1..b129ded 100644 --- a/bsfs/query/ast/filter_.py +++ b/bsfs/query/ast/filter_.py @@ -1,5 +1,27 @@ """Filter AST. +Note that it is easily possible to construct an AST that is inconsistent with +a given schema. Furthermore, it is possible to construct a semantically invalid +AST which that cannot be parsed correctly or includes contradicting statements. +The AST nodes do not (and cannot) check such issues. + +For example, consider the following AST: + +>>> Any(ns.bse.collection, +... And( +... Equals('hello'), +... Any(ns.bsm.guid, Any(ns.bsm.guid, Equals('hello'))), +... 
Any(ns.bst.label, Equals('world')), +... All(ns.bst.label, Not(Equals('world'))), +... ) +... ) + +This AST has multiple issues that are not verified upon its creation: +* A condition on a non-literal. +* A Filter on a literal. +* Conditions exclude each other +* The predicate along the branch have incompatible domains and ranges. + Part of the BlackStar filesystem (bsfs) module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 @@ -8,12 +30,45 @@ Author: Matthias Baumgartner, 2022 from collections import abc import typing +# bsfs imports +from bsfs.utils import URI, typename, normalize_args + +# inner-module imports +#from . import utils + # exports -__all__ : typing.Sequence[str] = [] +__all__ : typing.Sequence[str] = ( + # base classes + 'FilterExpression', + 'PredicateExpression', + # predicate expressions + 'OneOf', + 'Predicate', + # branching + 'All', + 'Any', + # aggregators + 'And', + 'Or', + # value matchers + 'Equals', + 'Substring', + 'EndsWith', + 'StartsWith', + # range matchers + 'GreaterThan', + 'LessThan', + # misc + 'Has', + 'Is', + 'Not', + ) ## code ## +# pylint: disable=too-few-public-methods # Many expressions use mostly magic methods + class _Expression(abc.Hashable): def __repr__(self) -> str: """Return the expressions's string representation.""" @@ -27,4 +82,352 @@ class _Expression(abc.Hashable): """Return True if *self* and *other* are equivalent.""" return isinstance(other, type(self)) + +class FilterExpression(_Expression): + """Generic Filter expression.""" + + +class PredicateExpression(_Expression): + """Generic Predicate expression.""" + + +class _Branch(FilterExpression): + """Branch the filter along a predicate.""" + + # predicate to follow. + predicate: PredicateExpression + + # child expression to evaluate. 
+ expr: FilterExpression + + def __init__( + self, + predicate: typing.Union[PredicateExpression, URI], + expr: FilterExpression, + ): + # process predicate argument + if isinstance(predicate, URI): + predicate = Predicate(predicate) + elif not isinstance(predicate, PredicateExpression): + raise TypeError(predicate) + # process expression argument + if not isinstance(expr, FilterExpression): + raise TypeError(expr) + # assign members + self.predicate = predicate + self.expr = expr + + def __repr__(self) -> str: + return f'{typename(self)}({self.predicate}, {self.expr})' + + def __hash__(self) -> int: + return hash((super().__hash__(), self.predicate, self.expr)) + + def __eq__(self, other) -> bool: + return super().__eq__(other) \ + and self.predicate == other.predicate \ + and self.expr == other.expr + +class Any(_Branch): + """Any (and at least one) triple matches.""" + + +class All(_Branch): + """All (and at least one) triples match.""" + + +class _Agg(FilterExpression, abc.Collection): + """Combine multiple expressions.""" + + # child expressions + expr: typing.Set[FilterExpression] + + def __init__( + self, + *expr: typing.Union[FilterExpression, + typing.Iterable[FilterExpression], + typing.Iterator[FilterExpression]] + ): + # unfold arguments + unfolded = set(normalize_args(*expr)) + # check type + if not all(isinstance(e, FilterExpression) for e in unfolded): + raise TypeError(expr) + # assign member + self.expr = unfolded + + def __contains__(self, expr: typing.Any) -> bool: + """Return True if *expr* is among the child expressions.""" + return expr in self.expr + + def __iter__(self) -> typing.Iterator[FilterExpression]: + """Iterator over child expressions.""" + return iter(self.expr) + + def __len__(self) -> int: + """Number of child expressions.""" + return len(self.expr) + + def __repr__(self) -> str: + return f'{typename(self)}({self.expr})' + + def __hash__(self) -> int: + return hash((super().__hash__(), tuple(self.expr))) # FIXME: Unique hash of 
different orders over self.expr + + def __eq__(self, other) -> bool: + return super().__eq__(other) and self.expr == other.expr + + +class And(_Agg): + """All conditions match.""" + + +class Or(_Agg): + """At least one condition matches.""" + + +class Not(FilterExpression): + """Invert a statement.""" + + # child expression + expr: FilterExpression + + def __init__(self, expr: FilterExpression): + # check argument + if not isinstance(expr, FilterExpression): + raise TypeError(expr) + # assign member + self.expr = expr + + def __repr__(self) -> str: + return f'{typename(self)}({self.expr})' + + def __hash__(self) -> int: + return hash((super().__hash__(), self.expr)) + + def __eq__(self, other: typing.Any) -> bool: + return super().__eq__(other) and self.expr == other.expr + + +class Has(FilterExpression): + """Has predicate N times""" + + # predicate to follow. + predicate: PredicateExpression + + # target count + count: FilterExpression + + def __init__( + self, + predicate: typing.Union[PredicateExpression, URI], + count: typing.Optional[typing.Union[FilterExpression, int]] = None, + ): + # check predicate + if isinstance(predicate, URI): + predicate = Predicate(predicate) + elif not isinstance(predicate, PredicateExpression): + raise TypeError(predicate) + # check count + if count is None: + count = GreaterThan(1, strict=False) + elif isinstance(count, int): + count = Equals(count) + elif not isinstance(count, FilterExpression): + raise TypeError(count) + # assign members + self.predicate = predicate + self.count = count + + def __repr__(self) -> str: + return f'{typename(self)}({self.predicate}, {self.count})' + + def __hash__(self) -> int: + return hash((super().__hash__(), self.predicate, self.count)) + + def __eq__(self, other) -> bool: + return super().__eq__(other) \ + and self.predicate == other.predicate \ + and self.count == other.count + + +class _Value(FilterExpression): + """ + """ + + # target value. 
+ value: typing.Any + + def __init__(self, value: typing.Any): + self.value = value + + def __repr__(self) -> str: + return f'{typename(self)}({self.value})' + + def __hash__(self) -> int: + return hash((super().__hash__(), self.value)) + + def __eq__(self, other) -> bool: + return super().__eq__(other) and self.value == other.value + + +class Is(_Value): + """Match the URI of a node.""" + + +class Equals(_Value): + """Value matches exactly. + NOTE: Value format must correspond to literal type; can be a string, a number, or a Node + """ + + +class Substring(_Value): + """Value matches a substring + NOTE: value format must be a string + """ + + +class StartsWith(_Value): + """Value begins with a given string.""" + + +class EndsWith(_Value): + """Value ends with a given string.""" + + +class _Bounded(FilterExpression): + """ + """ + + # bound. + threshold: float + + # strict (True, threshold excluded) or non-strict (False, threshold included) bound. + strict: bool + + def __init__( + self, + threshold: float, + strict: bool = True, + ): + self.threshold = float(threshold) + self.strict = bool(strict) + + def __repr__(self) -> str: + return f'{typename(self)}({self.threshold}, {self.strict})' + + def __hash__(self) -> int: + return hash((super().__hash__(), self.threshold, self.strict)) + + def __eq__(self, other) -> bool: + return super().__eq__(other) \ + and self.threshold == other.threshold \ + and self.strict == other.strict + + + +class LessThan(_Bounded): + """Value is (strictly) smaller than threshold.
+ NOTE: only on numerical literals + """ + + +class GreaterThan(_Bounded): + """Value is (strictly) larger than threshold + NOTE: only on numerical literals + """ + + +class Predicate(PredicateExpression): + """A single predicate.""" + + # predicate URI + predicate: URI + + # reverse the predicate's direction + reverse: bool + + def __init__( + self, + predicate: URI, + reverse: typing.Optional[bool] = False, + ): + # check arguments + if not isinstance(predicate, URI): + raise TypeError(predicate) + # assign members + self.predicate = predicate + self.reverse = bool(reverse) + + def __repr__(self) -> str: + return f'{typename(self)}({self.predicate}, {self.reverse})' + + def __hash__(self) -> int: + return hash((super().__hash__(), self.predicate, self.reverse)) + + def __eq__(self, other) -> bool: + return super().__eq__(other) \ + and self.predicate == other.predicate \ + and self.reverse == other.reverse + + +class OneOf(PredicateExpression, abc.Collection): + """A set of predicate alternatives. + + The predicates' domains must be ascendants or descendants of each other. + The overall domain is the most specific one. + + The predicates' ranges must be ascendants or descendants of each other. + The overall range is the most generic one. + """ + + # predicate alternatives + expr: typing.Set[PredicateExpression] + + def __init__(self, *expr: typing.Union[PredicateExpression, URI]): + # unfold arguments + unfolded = set(normalize_args(*expr)) # type: ignore [arg-type] # this is getting too complex...
+ # check arguments + if len(unfolded) == 0: + raise AttributeError('expected at least one expression, found none') + # ensure PredicateExpression + unfolded = {Predicate(e) if isinstance(e, URI) else e for e in unfolded} + # check type + if not all(isinstance(e, PredicateExpression) for e in unfolded): + raise TypeError(expr) + # assign member + self.expr = unfolded + + def __contains__(self, expr: typing.Any) -> bool: + """Return True if *expr* is among the child expressions.""" + return expr in self.expr + + def __iter__(self) -> typing.Iterator[PredicateExpression]: + """Iterator over child expressions.""" + return iter(self.expr) + + def __len__(self) -> int: + """Number of child expressions.""" + return len(self.expr) + + def __repr__(self) -> str: + return f'{typename(self)}({self.expr})' + + def __hash__(self) -> int: + return hash((super().__hash__(), tuple(self.expr))) # FIXME: Unique hash of different orders over self.expr + + def __eq__(self, other) -> bool: + return super().__eq__(other) and self.expr == other.expr + + +# Helpers + +def IsIn(*values): # pylint: disable=invalid-name # explicitly mimics an expression + """Match any of the given URIs.""" + return Or(Is(value) for value in normalize_args(*values)) + +def IsNotIn(*values): # pylint: disable=invalid-name # explicitly mimics an expression + """Match none of the given URIs.""" + return Not(IsIn(*values)) + ## EOF ## diff --git a/bsfs/query/validator.py b/bsfs/query/validator.py index 123b947..352203a 100644 --- a/bsfs/query/validator.py +++ b/bsfs/query/validator.py @@ -9,6 +9,8 @@ import typing # bsfs imports from bsfs import schema as bsc +from bsfs.namespace import ns +from bsfs.utils import errors, typename # inner-module imports from . import ast @@ -22,6 +24,18 @@ __all__ : typing.Sequence[str] = ( ## code ## class Filter(): + """Validate a `bsfs.query.ast.filter` query's structure and schema compliance. 
+ + * Conditions (Bounded, Value) can only be applied on literals + * Branches, Is, and Has can only be applied on nodes + * Predicates' domain and range must match + * Predicate paths must follow the schema + * Referenced types are present in the schema + + """ + + # vertex types + T_VERTEX = typing.Union[bsc.Node, bsc.Literal] # FIXME: Shouldn't this be in the schema? # schema to validate against. schema: bsc.Schema @@ -29,180 +43,182 @@ class Filter(): def __init__(self, schema: bsc.Schema): self.schema = schema - def parse(self, node: ast.filter.FilterExpression, subject: bsc.types._Vertex): - # subject is a node type - if not isinstance(subject, bsc.Node): - raise errors.ConsistencyError(f'Expected a node, found {subject}') - # subject exists in the schema - if subject not in self.schema.nodes: - raise errors.ConsistencyError(f'Invalid node type {subject}') - # root expression is valid - self._parse(node, subject) + def __call__(self, root_type: bsc.Node, query: ast.filter.FilterExpression): + """Validate a filter *query*, assuming the subject having *root_type*. + + Raises a `bsfs.utils.errors.ConsistencyError` if the query violates the schema. + Raises a `bsfs.utils.errors.BackendError` if the query structure is invalid.
+ + """ + # root_type must be a schema.Node + if not isinstance(root_type, bsc.Node): + raise TypeError(f'Expected a node, found {typename(root_type)}') + # root_type must exist in the schema + if root_type not in self.schema.nodes(): + raise errors.ConsistencyError(f'{root_type} is not defined in the schema') + # check root expression + self._parse_filter_expression(root_type, query) # all tests passed return True - def _parse_numerical_expression(self, node: ast.filter.FilterExpression, subject: bsc.types._Vertex): - if isinstance(node, ast.filter.And): - return self._and(node, subject) - elif isinstance(node, ast.filter.Or): - return self._or(node, subject) - elif isinstance(node, ast.filter.LessThan): - return self._lessThan(node, subject) - elif isinstance(node, ast.filter.GreaterThan): - return self._greaterThan(node, subject) - elif isinstance(node, ast.filter.Equals): - return self._equals(node, subject, numerical=True) - else: - raise errors.ConsistencyError(f'Expected a numerical expression, found {node}') - - - def __branch(self, node: typing.Union[ast.filter.Any, ast.filter.And], subject: bsc.types._Vertex): - # subject is a node type - if not isinstance(subject, bsc.Node): - raise errors.ConsistencyError(f'Expected a node, found {subject}') - # subject exists in the schema - if subject not in self.schema.nodes: - raise errors.ConsistencyError(f'Invalid node type {subject}') - # predicate is valid - dom, rng = self._parse_predicate_expression(node.predicate) - # subject is a subtype of the predicate's domain - if not subject <= dom: - raise errors.ConsistencyError(f'Expected type {dom}, found {subject}') - # child expression is valid - self._parse_filter_expression(node.expr, rng) + ## routing methods + + def _parse_filter_expression(self, type_: T_VERTEX, node: ast.filter.FilterExpression): + """Route *node* to the handler of the respective FilterExpression subclass.""" + if isinstance(node, ast.filter.Is): + return self._is(type_, node) + if 
isinstance(node, ast.filter.Not): + return self._not(type_, node) + if isinstance(node, ast.filter.Has): + return self._has(type_, node) + if isinstance(node, (ast.filter.Any, ast.filter.All)): + return self._branch(type_, node) + if isinstance(node, (ast.filter.And, ast.filter.Or)): + return self._agg(type_, node) + if isinstance(node, (ast.filter.Equals, ast.filter.Substring, ast.filter.StartsWith, ast.filter.EndsWith)): + return self._value(type_, node) + if isinstance(node, (ast.filter.LessThan, ast.filter.GreaterThan)): + return self._bounded(type_, node) + # invalid node + raise errors.BackendError(f'expected filter expression, found {node}') + + def _parse_predicate_expression(self, node: ast.filter.PredicateExpression) -> typing.Tuple[T_VERTEX, T_VERTEX]: + """Route *node* to the handler of the respective PredicateExpression subclass.""" + if isinstance(node, ast.filter.Predicate): + return self._predicate(node) + if isinstance(node, ast.filter.OneOf): + return self._one_of(node) + # invalid node + raise errors.BackendError(f'expected predicate expression, found {node}') + + + ## predicate expressions + + def _predicate(self, node: ast.filter.Predicate) -> typing.Tuple[T_VERTEX, T_VERTEX]: + # predicate exists in the schema + if not self.schema.has_predicate(node.predicate): + raise errors.ConsistencyError(f'predicate {node.predicate} is not in the schema') + # determine domain and range + pred = self.schema.predicate(node.predicate) + dom, rng = pred.domain, pred.range + if rng is None: + # FIXME: It is a design error that Predicates can have a None range... 
+ raise errors.BackendError(f'predicate {pred} has no range') + if node.reverse: + dom, rng = rng, dom # type: ignore [assignment] # variable re-use confuses mypy + # return domain and range + return dom, rng - def _any(self, node: ast.filter.Any, subject: bsc.types._Vertex): - return self.__branch(node, subject) + def _one_of(self, node: ast.filter.OneOf) -> typing.Tuple[T_VERTEX, T_VERTEX]: + # determine domain and range types + # NOTE: select the most specific domain and the most generic range + dom, rng = None, None + for pred in node: + # parse child expression + subdom, subrng = self._parse_predicate_expression(pred) + try: + # determine overall domain + if dom is None or subdom < dom: # pick most specific domain + dom = subdom + # domains must be related across all child expressions + if not subdom <= dom and not subdom >= dom: + raise errors.ConsistencyError(f'domains {subdom} and {dom} are not related') + except TypeError as err: # compared literal vs. node + raise errors.ConsistencyError(f'domains {subdom} and {dom} are not of the same type') from err - def _all(self, node: ast.filter.All, subject: bsc.types._Vertex): - return self.__branch(node, subject) + try: + # determine overall range + if rng is None or subrng > rng: # pick most generic range + rng = subrng + # ranges must be related across all child expressions + if not subrng <= rng and not subrng >= rng: + raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related') + except TypeError as err: # compared literal vs. 
node + raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not of the same type') from err + # check domain and range + if dom is None or rng is None: + # OneOf guarantees at least one expression, these two cases cannot happen + raise errors.UnreachableError() + # return domain and range + return dom, rng - def __agg(self, node: typing.Union[ast.filter.And, ast.filter.Or], subject: bsc.types._Vertex): + ## intermediates + + def _branch(self, type_: T_VERTEX, node: ast.filter._Branch): + # type is a Node + if not isinstance(type_, bsc.Node): + raise errors.ConsistencyError(f'expected a Node, found {type_}') + # type exists in the schema + # FIXME: Isn't it actually guaranteed that the type (except the root type) is part of the schema? + # all types can be traced back to (a) root_type, (b) predicate, or (c) manually set (e.g. in _is). + # For (a), we do (and have to) perform a check. For (c), the code base should be consistent throughout + # the module, so this is an assumption that has to be ensured in schema.Schema. For (b), we know (and + # check) that the predicate is in the schema, hence all node/literals derived from it are also in the + # schema by construction of the schema.Schema class. So, why do we check this every time? 
+ if type_ not in self.schema.nodes(): + raise errors.ConsistencyError(f'node {type_} is not in the schema') + # predicate is valid + dom, rng = self._parse_predicate_expression(node.predicate) + # type_ is a subtype of the predicate's domain + if not type_ <= dom: + raise errors.ConsistencyError(f'expected type {dom} or subtype thereof, found {type_}') + # child expression is valid + self._parse_filter_expression(rng, node.expr) + + def _agg(self, type_: T_VERTEX, node: ast.filter._Agg): for expr in node: # child expression is valid - self._parse_filter_expression(expr, subject) - - def _and(self, node: ast.filter.And, subject: bsc.types._Vertex): - return self.__agg(node, subject) - - def _or(self, node: ast.filter.Or, subject: bsc.types._Vertex): - return self.__agg(node, subject) - + self._parse_filter_expression(type_, expr) - def _not(self, node: ast.filter.Not, subject: bsc.types._Vertex): + def _not(self, type_: T_VERTEX, node: ast.filter.Not): # child expression is valid - self._parse_filter_expression(node.expr, subject) - - - def _has(self, node: ast.filter.Has, subject: bsc.types._Vertex): - # subject is a node type - if not isinstance(subject, bsc.Node): - raise errors.ConsistencyError(f'Expected a node, found {subject}') - # subject exists in the schema - if subject not in self.schema.nodes: - raise errors.ConsistencyError(f'Invalid node type {subject}') + self._parse_filter_expression(type_, node.expr) + + def _has(self, type_: T_VERTEX, node: ast.filter.Has): + # type is a Node + if not isinstance(type_, bsc.Node): + raise errors.ConsistencyError(f'expected a Node, found {type_}') + # type exists in the schema + if type_ not in self.schema.nodes(): + raise errors.ConsistencyError(f'node {type_} is not in the schema') # predicate is valid - dom, rng = self._parse_predicate_expression(node.predicate) - # subject is a subtype of the predicate's domain - if not subject <= dom: - raise errors.ConsistencyError(f'Expected type {dom}, found {subject}') + 
dom, _= self._parse_predicate_expression(node.predicate) + # type_ is a subtype of the predicate's domain + if not type_ <= dom: + raise errors.ConsistencyError(f'expected type {dom}, found {type_}') # node.count is a numerical expression - self._parse_numerical_expression(node.count, self.schema.literal(ns.xsd.numerical)) - - - def _equals(self, node: ast.filter.Equals, subject: bsc.types._Vertex, numerical: bool = False): - # subject is a literal - #if not isinstance(subject, bsc.Literal): - # raise errors.ConsistencyError(f'Expected a literal, found {subject}') - if isinstance(subject, bsc.Node): - # FIXME: How to handle this case? - # FIXME: How to check if a NodeType is acceptable? - # FIXME: Maybe use flags to control what is expected as node identifiers? - from bsfs.graph.nodes import Nodes # FIXME - if not isinstance(node.value, Nodes) and not isinstance(node.value, URI): - raise errors.ConsistencyError(f'Expected a Nodes or URI, found {node.value}') - elif isinstance(subject, bsc.Literal): - # literal exists in the schema - if subject not in self.schema.literals: - raise errors.ConsistencyError(f'Invalid literal type {subject}') - else: - # FIXME: - raise errors.ConsistencyError(f'Expected a literal, found {subject}') - # node.value is numeric (if requested) - if numerical and not isinstance(node.value, float) and not isinstance(node.value, int): - raise errors.ConsistencyError(f'Expected a numerical value (int or float), found {node.value}') - # NOTE: We cannot check if node.value agrees with the subject since we don't know - # all literal types, their hierarchy, and how the backend converts datatypes. 
- - - def _substring(self, node: ast.filter.Substring, subject: bsc.types._Vertex): - # subject is a literal - if not isinstance(subject, bsc.Literal): - raise errors.ConsistencyError(f'Expected a literal, found {subject}') - # literal exists in the schema - if subject not in self.schema.literals: - raise errors.ConsistencyError(f'Invalid literal type {subject}') - # node.value matches literal datatype - if not subject.is_a(ns.xsd.string): - raise errors.ConsistencyError(f'Expected a string literal, found {subject}') - - - def _lessThan(self, node: ast.filter.LessThan, subject: bsc.types._Vertex): - # subject is a literal - if not isinstance(subject, bsc.Literal): - raise errors.ConsistencyError(f'Expected a literal, found {subject}') - # literal exists in the schema - if subject not in self.schema.literals: - raise errors.ConsistencyError(f'Invalid literal type {subject}') - # subject is numerical - if not subject.is_a(ns.xsd.numerical): - raise errors.ConsistencyError(f'Expected a numerical literal, found {subject}') - - - def _greaterThan(self, node: ast.filter.GreaterThan, subject: bsc.types._Vertex): - # subject is a literal - if not isinstance(subject, bsc.Literal): - raise errors.ConsistencyError(f'Expected a literal, found {subject}') - # literal exists in the schema - if subject not in self.schema.literals: - raise errors.ConsistencyError(f'Invalid literal type {subject}') - # subject is numerical - if not subject.is_a(ns.xsd.numerical): - raise errors.ConsistencyError(f'Expected a numerical literal, found {subject}') - - - def _predicate(self, node: ast.filter.Predicate): - try: - # predicate exists in the schema - pred = self.schema.predicate(node.predicate) - except KeyError: - raise errors.ConsistencyError(f'') # FIXME - if node.reverse: - return pred.range, pred.domain - else: - return pred.domain, pred.range - + # FIXME: We have to ensure that ns.xsd.integer is always known in the schema! 
+ self._parse_filter_expression(self.schema.literal(ns.xsd.integer), node.count) + + + ## conditions + + def _is(self, type_: T_VERTEX, node: ast.filter.Is): # pylint: disable=unused-argument # (node) + if not isinstance(type_, bsc.Node): + raise errors.ConsistencyError(f'expected a Node, found {type_}') + if type_ not in self.schema.nodes(): + raise errors.ConsistencyError(f'node {type_} is not in the schema') + + def _value(self, type_: T_VERTEX, node: ast.filter._Value): # pylint: disable=unused-argument # (node) + # type is a literal + if not isinstance(type_, bsc.Literal): + raise errors.ConsistencyError(f'expected a Literal, found {type_}') + # type exists in the schema + if type_ not in self.schema.literals(): + raise errors.ConsistencyError(f'literal {type_} is not in the schema') + # FIXME: Check if node.value corresponds to type_ + # FIXME: A specific literal might be requested (i.e., a numeric type when used in Has) + + def _bounded(self, type_: T_VERTEX, node: ast.filter._Bounded): # pylint: disable=unused-argument # (node) + # type is a literal + if not isinstance(type_, bsc.Literal): + raise errors.ConsistencyError(f'expected a Literal, found {type_}') + # type exists in the schema + if type_ not in self.schema.literals(): + raise errors.ConsistencyError(f'literal {type_} is not in the schema') + # FIXME: Check if node.value corresponds to type_ - def _oneOf(self, node: ast.filter.OneOf): - dom, rng = None, None - for pred in node: - try: - # parse child expression - subdom, subrng = self._parse_predicate_expression(pred) - # domain and range must be related across all child expressions - if not subdom <= dom and not subdom >= dom: - raise errors.ConsistencyError(f'') # FIXME - if not subrng <= rng and not subrng >= rng: - raise errors.ConsistencyError(f'') # FIXME - # determine overall domain and range - if dom is None or subdom < dom: # pick most specific domain - dom = subdom - if rng is None or subrng > rng: # pick most generic range - rng = 
subrng - except KeyError: - raise errors.ConsistencyError(f'') - return dom, rng ## EOF ## diff --git a/bsfs/utils/__init__.py b/bsfs/utils/__init__.py index 94680ee..6737cef 100644 --- a/bsfs/utils/__init__.py +++ b/bsfs/utils/__init__.py @@ -9,7 +9,7 @@ import typing # inner-module imports from . import errors -from .commons import typename +from .commons import typename, normalize_args from .uri import URI from .uuid import UUID, UCID @@ -19,6 +19,7 @@ __all__ : typing.Sequence[str] = ( 'URI', 'UUID', 'errors', + 'normalize_args', 'typename', ) diff --git a/bsfs/utils/commons.py b/bsfs/utils/commons.py index bad2fe0..e9f0b7f 100644 --- a/bsfs/utils/commons.py +++ b/bsfs/utils/commons.py @@ -5,10 +5,12 @@ A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ # imports +from collections import abc import typing # exports __all__: typing.Sequence[str] = ( + 'normalize_args', 'typename', ) @@ -19,5 +21,37 @@ def typename(obj) -> str: """Return the type name of *obj*.""" return type(obj).__name__ +# argument type in `normalize_args`. +ArgType = typing.TypeVar('ArgType') # pylint: disable=invalid-name # type vars don't follow the usual convention + +def normalize_args( + *args: typing.Union[ArgType, typing.Iterable[ArgType], typing.Iterator[ArgType]] + ) -> typing.Tuple[ArgType, ...]: + """Arguments to a function can be passed as individual arguments, list-like + structures, or iterables. This function processes any of these styles and + returns a tuple of the respective items. Typically used within a function + to provide a flexible interface but still have parameters in a normalized form. + + Examples: + + >>> normalize_args(0,1,2) + (0, 1, 2) + >>> normalize_args([0,1,2]) + (0, 1, 2) + >>> normalize_args(range(3)) + (0, 1, 2) + + """ + if len(args) == 0: # foo() + return tuple() + if len(args) > 1: # foo(0, 1, 2) + return tuple(args) # type: ignore [arg-type] # we assume that argument styles (arg vs. iterable) are not mixed.
+ if isinstance(args[0], abc.Iterator): # foo(iter([0,1,2])) + return tuple(args[0]) + if isinstance(args[0], abc.Iterable) and not isinstance(args[0], str): # foo([0, 1, 2]) + return tuple(args[0]) + # foo(0) + return (args[0], ) # type: ignore [return-value] # if args[0] is a str, we assume that ArgType was str. + ## EOF ## diff --git a/bsfs/utils/errors.py b/bsfs/utils/errors.py index c5e8e16..be9d40e 100644 --- a/bsfs/utils/errors.py +++ b/bsfs/utils/errors.py @@ -38,4 +38,7 @@ class UnreachableError(ProgrammingError): class ConfigError(_BSFSError): """User config issue.""" +class BackendError(_BSFSError): + """Could not parse an AST structure.""" + ## EOF ## -- cgit v1.2.3 From 73e39cb4967949025aefe874f401e27b0abb772c Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Thu, 22 Dec 2022 20:29:57 +0100 Subject: filter ast parser and get method in sparql store --- bsfs/triple_store/base.py | 6 +- bsfs/triple_store/sparql/parse_filter.py | 307 +++++++++++++++++++++++++++++++ bsfs/triple_store/sparql/sparql.py | 51 ++++- 3 files changed, 357 insertions(+), 7 deletions(-) create mode 100644 bsfs/triple_store/sparql/parse_filter.py (limited to 'bsfs') diff --git a/bsfs/triple_store/base.py b/bsfs/triple_store/base.py index 5ff9523..7e03714 100644 --- a/bsfs/triple_store/base.py +++ b/bsfs/triple_store/base.py @@ -113,9 +113,11 @@ class TripleStoreBase(abc.ABC): def get( self, node_type: _schema.Node, - query: ast.filter.FilterExpression, + query: typing.Optional[ast.filter.FilterExpression] = None, ) -> typing.Iterator[URI]: - """Return guids of nodes of type *node_type* that match the *query*.""" + """Return guids of nodes of type *node_type* that match the *query*. + Return all guids of the respective type if *query* is None. 
+ """ @abc.abstractmethod def exists( diff --git a/bsfs/triple_store/sparql/parse_filter.py b/bsfs/triple_store/sparql/parse_filter.py new file mode 100644 index 0000000..d4db0aa --- /dev/null +++ b/bsfs/triple_store/sparql/parse_filter.py @@ -0,0 +1,307 @@ +""" + +Part of the BlackStar filesystem (bsfs) module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# bsfs imports +from bsfs import schema as bsc +from bsfs.namespace import ns +from bsfs.query import ast +from bsfs.utils import URI, errors + +# exports +__all__: typing.Sequence[str] = ( + 'Filter', + ) + +class _GenHopName(): + """Generator that produces a new unique symbol name with each iteration.""" + + # Symbol name prefix. + prefix: str + + # Current counter. + curr: int + + def __init__(self, prefix: str = '?hop', start: int = 0): + self.prefix = prefix + self.curr = start - 1 + + def __next__(self): + """Generate and return the next unique name.""" + self.curr += 1 + return self.prefix + str(self.curr) + + +class Filter(): + """Translate `bsfs.query.ast.filter` structures into Sparql queries.""" + + # Current schema to validate against. + schema: bsc.Schema + + # Generator that produces unique symbol names. + ngen: _GenHopName + + # Vertex type. 
+ T_VERTEX = typing.Union[bsc.Node, bsc.Literal] + + def __init__(self, schema): + self.schema = schema + self.ngen = _GenHopName() + + def __call__( + self, + root_type: bsc.Node, + root: typing.Optional[ast.filter.FilterExpression] = None, + ) -> str: + """ + """ + # check root_type + if not isinstance(root_type, bsc.Node): + raise errors.BackendError(f'expected Node, found {root_type}') + if root_type not in self.schema.nodes(): + raise errors.ConsistencyError(f'node {root_type} is not in the schema') + # parse root + if root is None: + cond = '' + else: + cond = self._parse_filter_expression(root_type, root, '?ent') + # assemble query + return f''' + SELECT ?ent + WHERE {{ + ?ent <{ns.rdf.type}>/<{ns.rdfs.subClassOf}>* <{root_type.uri}> . + {cond} + }} + ''' + + def _parse_filter_expression(self, type_: T_VERTEX, node: ast.filter.FilterExpression, head: str) -> str: + """Route *node* to the handler of the respective FilterExpression subclass.""" + if isinstance(node, ast.filter.Is): + return self._is(type_, node, head) + if isinstance(node, ast.filter.Not): + return self._not(type_, node, head) + if isinstance(node, ast.filter.Has): + return self._has(type_, node, head) + if isinstance(node, ast.filter.Any): + return self._any(type_, node, head) + if isinstance(node, ast.filter.All): + return self._all(type_, node, head) + if isinstance(node, ast.filter.And): + return self._and(type_, node, head) + if isinstance(node, ast.filter.Or): + return self._or(type_, node, head) + if isinstance(node, ast.filter.Equals): + return self._equals(type_, node, head) + if isinstance(node, ast.filter.Substring): + return self._substring(type_, node, head) + if isinstance(node, ast.filter.StartsWith): + return self._starts_with(type_, node, head) + if isinstance(node, ast.filter.EndsWith): + return self._ends_with(type_, node, head) + if isinstance(node, ast.filter.LessThan): + return self._less_than(type_, node, head) + if isinstance(node, ast.filter.GreaterThan): + return 
self._greater_than(type_, node, head) + # invalid node + raise errors.BackendError(f'expected filter expression, found {node}') + + def _parse_predicate_expression( + self, + type_: T_VERTEX, + node: ast.filter.PredicateExpression + ) -> typing.Tuple[str, T_VERTEX]: + """Route *node* to the handler of the respective PredicateExpression subclass.""" + if isinstance(node, ast.filter.Predicate): + return self._predicate(type_, node) + if isinstance(node, ast.filter.OneOf): + return self._one_of(type_, node) + # invalid node + raise errors.BackendError(f'expected predicate expression, found {node}') + + def _one_of(self, node_type: T_VERTEX, node: ast.filter.OneOf) -> typing.Tuple[str, T_VERTEX]: + """ + """ + if not isinstance(node_type, bsc.Node): + raise errors.BackendError(f'expected Node, found {node_type}') + # walk through predicates + suburi, rng = set(), None + for pred in node: # OneOf guarantees at least one expression + puri, subrng = self._parse_predicate_expression(node_type, pred) + # track predicate uris + suburi.add(puri) + try: + # check for more generic range + if rng is None or subrng > rng: + rng = subrng + # check range consistency + if not subrng <= rng and not subrng >= rng: + raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related') + except TypeError as err: # subrng and rng are not comparable + raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related') from err + if rng is None: + # for mypy to be certain of the rng type + # if rng were None, we'd have gotten a TypeError above (None > None) + raise errors.UnreachableError() + # return joint predicate expression and next range + return '|'.join(suburi), rng + + def _predicate(self, node_type: T_VERTEX, node: ast.filter.Predicate) -> typing.Tuple[str, T_VERTEX]: + """ + """ + # check node_type + if not isinstance(node_type, bsc.Node): + raise errors.BackendError(f'expected Node, found {node_type}') + # fetch predicate and its uri + puri = node.predicate + # 
get and check predicate, domain, and range + if not self.schema.has_predicate(puri): + raise errors.ConsistencyError(f'predicate {puri} is not in the schema') + pred = self.schema.predicate(puri) + if pred.range is None: + # FIXME: It is a design error that Predicates can have a None range... + raise errors.BackendError(f'predicate {pred} has no range') + dom, rng = pred.domain, pred.range + # encapsulate predicate uri + puri = f'<{puri}>' # type: ignore [assignment] # variable re-use confuses mypy + # apply reverse flag + if node.reverse: + puri = URI('^' + puri) + dom, rng = rng, dom # type: ignore [assignment] # variable re-use confuses mypy + # check path consistency + if not node_type <= dom: + raise errors.ConsistencyError(f'expected type {dom} or subtype thereof, found {node_type}') + # return predicate URI and next node type + return puri, rng + + def _any(self, node_type: T_VERTEX, node: ast.filter.Any, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Node): + raise errors.BackendError(f'expected Node, found {node_type}') + # parse predicate + pred, next_type = self._parse_predicate_expression(node_type, node.predicate) + # parse expression + nexthead = next(self.ngen) + expr = self._parse_filter_expression(next_type, node.expr, nexthead) + # combine results + return f'{head} {pred} {nexthead} . {expr}' + + def _all(self, node_type: T_VERTEX, node: ast.filter.All, head: str) -> str: + """ + """ + # NOTE: All(P, E) := Not(Any(P, Not(E))) and EXISTS(P, ?) 
+ if not isinstance(node_type, bsc.Node): + raise errors.BackendError(f'expected Node, found {node_type}') + # parse rewritten ast + expr = self._parse_filter_expression(node_type, + ast.filter.Not( + ast.filter.Any(node.predicate, + ast.filter.Not(node.expr))), head) + # parse predicate for existence constraint + pred, _ = self._parse_predicate_expression(node_type, node.predicate) + temphead = next(self.ngen) + # return existence and rewritten expression + return f'FILTER EXISTS {{ {head} {pred} {temphead} }} . ' + expr + + def _and(self, node_type: T_VERTEX, node: ast.filter.And, head: str) -> str: + """ + """ + sub = [self._parse_filter_expression(node_type, expr, head) for expr in node] + return ' . '.join(sub) + + def _or(self, node_type: T_VERTEX, node: ast.filter.Or, head: str) -> str: + """ + """ + # potential special case optimization: + # * ast: Or(Equals('foo'), Equals('bar'), ...) + # * query: VALUES ?head { "value1"^^<...> "value2"^^<...> "value3"^<...> ... } + sub = [self._parse_filter_expression(node_type, expr, head) for expr in node] + sub = ['{' + expr + '}' for expr in sub] + return ' UNION '.join(sub) + + def _not(self, node_type: T_VERTEX, node: ast.filter.Not, head: str) -> str: + """ + """ + expr = self._parse_filter_expression(node_type, node.expr, head) + if isinstance(node_type, bsc.Literal): + return f'MINUS {{ {expr} }}' + # NOTE: for bsc.Node types, we must include at least one expression in the body of MINUS, + # otherwise the connection between the context and body of MINUS is lost. + # The simplest (and non-interfering) choice is a type statement. + return f'MINUS {{ {head} <{ns.rdf.type}>/<{ns.rdfs.subClassOf}>* <{node_type.uri}> . 
{expr} }}' + + def _has(self, node_type: T_VERTEX, node: ast.filter.Has, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Node): + raise errors.BackendError(f'expected Node, found {node_type}') + # parse predicate + pred, _ = self._parse_predicate_expression(node_type, node.predicate) + # get new heads + inner = next(self.ngen) + outer = next(self.ngen) + # predicate count expression (fetch number of predicates at *head*) + num_preds = f'{{ SELECT (COUNT(distinct {inner}) as {outer}) WHERE {{ {head} {pred} {inner} }} }}' + # count expression + # FIXME: We have to ensure that ns.xsd.integer is always known in the schema! + count_bounds = self._parse_filter_expression(self.schema.literal(ns.xsd.integer), node.count, outer) + # combine + return num_preds + ' . ' + count_bounds + + def _is(self, node_type: T_VERTEX, node: ast.filter.Is, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Node): + raise errors.BackendError(f'expected Node, found {node_type}') + return f'VALUES {head} {{ <{node.value}> }}' + + def _equals(self, node_type: T_VERTEX, node: ast.filter.Equals, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Literal): + raise errors.BackendError(f'expected Literal, found {node}') + return f'VALUES {head} {{ "{node.value}"^^<{node_type.uri}> }}' + + def _substring(self, node_type: T_VERTEX, node: ast.filter.Substring, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Literal): + raise errors.BackendError(f'expected Literal, found {node_type}') + return f'FILTER contains(str({head}), "{node.value}")' + + def _starts_with(self, node_type: T_VERTEX, node: ast.filter.StartsWith, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Literal): + raise errors.BackendError(f'expected Literal, found {node_type}') + return f'FILTER strstarts(str({head}), "{node.value}")' + + def _ends_with(self, node_type: T_VERTEX, node: ast.filter.EndsWith, head: str) -> str: + """ + """ + if not 
isinstance(node_type, bsc.Literal): + raise errors.BackendError(f'expected Literal, found {node_type}') + return f'FILTER strends(str({head}), "{node.value}")' + + def _less_than(self, node_type: T_VERTEX, node: ast.filter.LessThan, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Literal): + raise errors.BackendError(f'expected Literal, found {node_type}') + equality = '=' if not node.strict else '' + return f'FILTER ({head} <{equality} {float(node.threshold)})' + + def _greater_than(self, node_type: T_VERTEX, node: ast.filter.GreaterThan, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Literal): + raise errors.BackendError(f'expected Literal, found {node_type}') + equality = '=' if not node.strict else '' + return f'FILTER ({head} >{equality} {float(node.threshold)})' + +## EOF ## diff --git a/bsfs/triple_store/sparql/sparql.py b/bsfs/triple_store/sparql/sparql.py index 7172f34..c3cbff6 100644 --- a/bsfs/triple_store/sparql/sparql.py +++ b/bsfs/triple_store/sparql/sparql.py @@ -15,6 +15,7 @@ from bsfs.query import ast from bsfs.utils import errors, URI # inner-module imports +from . import parse_filter from .. import base @@ -86,11 +87,15 @@ class SparqlStore(base.TripleStoreBase): # The local schema. _schema: bsc.Schema + # Filter parser + _filter_parser: parse_filter.Filter + def __init__(self): super().__init__(None) self._graph = rdflib.Graph() self._transaction = _Transaction(self._graph) self._schema = bsc.Schema.Empty() + self._filter_parser = parse_filter.Filter(self._schema) # NOTE: mypy and pylint complain about the **kwargs not being listed (contrasting super) # However, not having it here is clearer since it's explicit that there are no arguments. 
@@ -127,10 +132,17 @@ class SparqlStore(base.TripleStoreBase): # get deleted classes sub = self.schema - schema - # remove predicate instances for pred in sub.predicates: + # remove predicate instances for src, trg in self._graph.subject_objects(rdflib.URIRef(pred.uri)): self._transaction.remove((src, rdflib.URIRef(pred.uri), trg)) + # remove predicate definition + if pred.parent is not None: + self._transaction.remove(( + rdflib.URIRef(pred.uri), + rdflib.RDFS.subClassOf, + rdflib.URIRef(pred.parent.uri), + )) # remove node instances for node in sub.nodes: @@ -144,17 +156,46 @@ class SparqlStore(base.TripleStoreBase): self._transaction.remove((inst, pred, trg)) # remove instance self._transaction.remove((inst, rdflib.RDF.type, rdflib.URIRef(node.uri))) - - # NOTE: Nothing to do for literals + # remove node definition + if node.parent is not None: + self._transaction.remove(( + rdflib.URIRef(node.uri), + rdflib.RDFS.subClassOf, + rdflib.URIRef(node.parent.uri), + )) + + for lit in sub.literals: + # remove literal definition + if lit.parent is not None: + self._transaction.remove(( + rdflib.URIRef(lit.uri), + rdflib.RDFS.subClassOf, + rdflib.URIRef(lit.parent.uri), + )) + + # add predicate, node, and literal hierarchies to the graph + for itm in itertools.chain(schema.predicates(), schema.nodes(), schema.literals()): + if itm.parent is not None: + self._transaction.add((rdflib.URIRef(itm.uri), rdflib.RDFS.subClassOf, rdflib.URIRef(itm.parent.uri))) # commit instance changes self.commit() # migrate schema self._schema = schema + self._filter_parser.schema = schema - def get(self, node_type: bsc.Node, query: ast.filter.FilterExpression) -> typing.Iterator[URI]: - raise NotImplementedError() + def get( + self, + node_type: bsc.Node, + query: typing.Optional[ast.filter.FilterExpression] = None, + ) -> typing.Iterator[URI]: + if node_type not in self.schema.nodes(): + raise errors.ConsistencyError(f'{node_type} is not defined in the schema') + if not isinstance(query, 
ast.filter.FilterExpression): + raise TypeError(query) + for guid, in self._graph.query(self._filter_parser(node_type, query)): + yield URI(guid) def _has_type(self, subject: URI, node_type: bsc.Node) -> bool: """Return True if *subject* is a node of class *node_type* or a subclass thereof.""" -- cgit v1.2.3 From ca7ee6c59d2eb3f4ec4d16e392d12d946cd85e4d Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Thu, 22 Dec 2022 20:33:00 +0100 Subject: filter-ast based get interface in graph. * Graph interface: Graph.get added * Node instance resolver so that Nodes can be used in a filter ast * AC interface: filter_read added to interface * upstream test adjustments of previous sparql store changes --- bsfs/graph/ac/base.py | 4 ++ bsfs/graph/ac/null.py | 5 ++ bsfs/graph/graph.py | 28 +++++++-- bsfs/graph/resolve.py | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 193 insertions(+), 5 deletions(-) create mode 100644 bsfs/graph/resolve.py (limited to 'bsfs') diff --git a/bsfs/graph/ac/base.py b/bsfs/graph/ac/base.py index bc9aeb3..0703e2e 100644 --- a/bsfs/graph/ac/base.py +++ b/bsfs/graph/ac/base.py @@ -10,6 +10,7 @@ import typing # bsfs imports from bsfs import schema +from bsfs.query import ast from bsfs.triple_store import TripleStoreBase from bsfs.utils import URI @@ -67,5 +68,8 @@ class AccessControlBase(abc.ABC): def createable(self, node_type: schema.Node, guids: typing.Iterable[URI]) -> typing.Iterable[URI]: """Return nodes that are allowed to be created.""" + @abc.abstractmethod + def filter_read(self, node_type: schema.Node, query: ast.filter.FilterExpression) -> ast.filter.FilterExpression: + """Re-write a filter *query* to get (i.e., read) *node_type* nodes.""" ## EOF ## diff --git a/bsfs/graph/ac/null.py b/bsfs/graph/ac/null.py index 36838bd..12b4e87 100644 --- a/bsfs/graph/ac/null.py +++ b/bsfs/graph/ac/null.py @@ -10,6 +10,7 @@ import typing # bsfs imports from bsfs import schema from bsfs.namespace import ns +from bsfs.query 
import ast from bsfs.utils import URI # inner-module imports @@ -49,4 +50,8 @@ class NullAC(base.AccessControlBase): """Return nodes that are allowed to be created.""" return guids + def filter_read(self, node_type: schema.Node, query: ast.filter.FilterExpression) -> ast.filter.FilterExpression: + """Re-write a filter *query* to get (i.e., read) *node_type* nodes.""" + return query + ## EOF ## diff --git a/bsfs/graph/graph.py b/bsfs/graph/graph.py index 51fe75d..f030fed 100644 --- a/bsfs/graph/graph.py +++ b/bsfs/graph/graph.py @@ -9,13 +9,15 @@ import os import typing # bsfs imports -from bsfs.query import ast +from bsfs.query import ast, validate from bsfs.schema import Schema from bsfs.triple_store import TripleStoreBase from bsfs.utils import URI, typename # inner-module imports +from . import ac from . import nodes as _nodes +from . import resolve # exports __all__: typing.Sequence[str] = ( @@ -44,6 +46,9 @@ class Graph(): def __init__(self, backend: TripleStoreBase, user: URI): self._backend = backend self._user = user + self._resolver = resolve.Filter(self._backend.schema) + self._validate = validate.Filter(self._backend.schema) + self._ac = ac.NullAC(self._backend, self._user) # ensure Graph schema requirements self.migrate(self._backend.schema) @@ -85,6 +90,9 @@ class Graph(): # migrate schema in backend # FIXME: consult access controls! self._backend.schema = schema + # re-initialize members + self._resolver.schema = self.schema + self._validate.schema = self.schema # return self return self @@ -108,11 +116,21 @@ class Graph(): *node_type*) once some data is assigned to them. """ - type_ = self.schema.node(node_type) - return _nodes.Nodes(self._backend, self._user, type_, {guid}) + return self.nodes(node_type, {guid}) - def get(self, node_type: URI, subject: ast.filter.FilterExpression) -> _nodes.Nodes: + def get(self, node_type: URI, query: ast.filter.FilterExpression) -> _nodes.Nodes: # FIXME: How about empty query? 
"""Return a `Nodes` instance over all nodes of type *node_type* that match the *subject* query.""" - raise NotImplementedError() + # get node type + type_ = self.schema.node(node_type) + # resolve Nodes instances + query = self._resolver(type_, query) + # add access controls to query + query = self._ac.filter_read(type_, query) + # validate query + self._validate(type_, query) + # query the backend + guids = self._backend.get(type_, query) # no need to materialize + # return Nodes instance + return _nodes.Nodes(self._backend, self._user, type_, guids) ## EOF ## diff --git a/bsfs/graph/resolve.py b/bsfs/graph/resolve.py new file mode 100644 index 0000000..feb0855 --- /dev/null +++ b/bsfs/graph/resolve.py @@ -0,0 +1,161 @@ +""" + +Part of the BlackStar filesystem (bsfs) module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# bsfs imports +from bsfs import schema as bsc +from bsfs.query import ast +from bsfs.utils import errors + +# inner-module imports +from . import nodes + +# exports +__all__: typing.Sequence[str] = ( + 'Filter', + ) + + +## code ## + +class Filter(): + """Rewrites the query to replace `bsfs.graph.nodes.Nodes` instances with the respective URI. + Does only limited type checking and schema validation. + Use `bsfs.schema.validate.Filter` to do so. 
+ + Example: + input: Any(ns.bse.tag, Is(Nodes(...))) + output: Any(ns.bse.tag, Or(Is(...), Is(...), ...))) + + >>> tags = graph.node(ns.bsfs.Tag, 'http://example.com/me/tag#1234') + >>> graph.get(ns.bsfs.Entity, ast.filter.Any(ns.bse.tag, ast.filter.Is(tags))) + + """ + + T_VERTEX = typing.Union[bsc.Node, bsc.Literal] + + def __init__(self, schema): + self.schema = schema + + def __call__(self, root_type: bsc.Node, node: ast.filter.FilterExpression): + return self._parse_filter_expression(root_type, node) + + def _parse_filter_expression( + self, + type_: T_VERTEX, + node: ast.filter.FilterExpression, + ) -> ast.filter.FilterExpression: + """Route *node* to the handler of the respective FilterExpression subclass.""" + if isinstance(node, ast.filter.Is): + return self._is(type_, node) + if isinstance(node, ast.filter.Not): + return self._not(type_, node) + if isinstance(node, ast.filter.Has): + return self._has(type_, node) + if isinstance(node, ast.filter.Any): + return self._any(type_, node) + if isinstance(node, ast.filter.All): + return self._all(type_, node) + if isinstance(node, ast.filter.And): + return self._and(type_, node) + if isinstance(node, ast.filter.Or): + return self._or(type_, node) + if isinstance(node, (ast.filter.Equals, ast.filter.Substring, \ + ast.filter.StartsWith, ast.filter.EndsWith)): + return self._value(type_, node) + if isinstance(node, (ast.filter.LessThan, ast.filter.GreaterThan)): + return self._bounded(type_, node) + # invalid node + raise errors.BackendError(f'expected filter expression, found {node}') + + def _parse_predicate_expression(self, node: ast.filter.PredicateExpression) -> T_VERTEX: + """Route *node* to the handler of the respective PredicateExpression subclass.""" + if isinstance(node, ast.filter.Predicate): + return self._predicate(node) + if isinstance(node, ast.filter.OneOf): + return self._one_of(node) + # invalid node + raise errors.BackendError(f'expected predicate expression, found {node}') + + def 
_predicate(self, node: ast.filter.Predicate) -> T_VERTEX: + if not self.schema.has_predicate(node.predicate): + raise errors.ConsistencyError(f'predicate {node.predicate} is not in the schema') + pred = self.schema.predicate(node.predicate) + dom, rng = pred.domain, pred.range + if node.reverse: + dom, rng = rng, dom + return rng + + def _one_of(self, node: ast.filter.OneOf) -> T_VERTEX: + # determine domain and range types + rng = None + for pred in node: + # parse child expression + subrng = self._parse_predicate_expression(pred) + # determine the next type + try: + if rng is None or subrng > rng: # pick most generic range + rng = subrng + except TypeError as err: + raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related') from err + if rng is None: + raise errors.UnreachableError() + return rng + + def _any(self, type_: T_VERTEX, node: ast.filter.Any) -> ast.filter.Any: # pylint: disable=unused-argument + next_type = self._parse_predicate_expression(node.predicate) + return ast.filter.Any(node.predicate, self._parse_filter_expression(next_type, node.expr)) + + def _all(self, type_: T_VERTEX, node: ast.filter.All) -> ast.filter.All: # pylint: disable=unused-argument + next_type = self._parse_predicate_expression(node.predicate) + return ast.filter.All(node.predicate, self._parse_filter_expression(next_type, node.expr)) + + def _and(self, type_: T_VERTEX, node: ast.filter.And) -> ast.filter.And: + return ast.filter.And({self._parse_filter_expression(type_, expr) for expr in node}) + + def _or(self, type_: T_VERTEX, node: ast.filter.Or) -> ast.filter.Or: + return ast.filter.Or({self._parse_filter_expression(type_, expr) for expr in node}) + + def _not(self, type_: T_VERTEX, node: ast.filter.Not) -> ast.filter.Not: + return ast.filter.Not(self._parse_filter_expression(type_, node.expr)) + + def _has(self, type_: T_VERTEX, node: ast.filter.Has) -> ast.filter.Has: # pylint: disable=unused-argument + return node + + def _value(self, type_: T_VERTEX, 
node: ast.filter._Value) -> ast.filter._Value: # pylint: disable=unused-argument + return node + + def _bounded(self, type_: T_VERTEX, node: ast.filter._Bounded) -> ast.filter._Bounded: # pylint: disable=unused-argument + return node + + def _is(self, type_: T_VERTEX, node: ast.filter.Is) -> typing.Union[ast.filter.Or, ast.filter.Is]: + # check if action is needed + if not isinstance(node.value, nodes.Nodes): + return node + # check schema consistency + if node.value.node_type not in self.schema.nodes(): + raise errors.ConsistencyError(f'node {node.value.node_type} is not in the schema') + # check type compatibility + if not isinstance(type_, bsc.Node): + raise errors.ConsistencyError(f'expected a node, found {type_}') + if not node.value.node_type <= type_: + raise errors.ConsistencyError(f'expected type {type_} or subtype thereof, found {node.value.node_type}') + # NOTE: We assume that the node type is checked when writing to the backend. + # Links to any of the guids can therefore only exist if the type matches. + # Hence, we don't add a type check/constrain here. + return ast.filter.Or(ast.filter.Is(guid) for guid in node.value.guids) + # optimized code, removing unnecessary ast.filter.Or + #guids = set(node.value.guids) + #if len(guids) == 0: + # raise errors.BackendError(f'') + #if len(guids) == 1: + # return ast.filter.Nodeid(next(iter(guids))) + #return ast.filter.Or(ast.filter.Is(guid) for guid in guids) + + +## EOF ## -- cgit v1.2.3