diff options
author | Matthias Baumgartner <dev@igsor.net> | 2023-03-05 19:25:29 +0100 |
---|---|---|
committer | Matthias Baumgartner <dev@igsor.net> | 2023-03-05 19:25:29 +0100 |
commit | 48b6081d0092e9c5a1b0ad79bdde2e51649bf61a (patch) | |
tree | 634198c34aae3c0306ce30ac7452abd7b53a14e8 /bsfs | |
parent | 91437ba89d35bf482f3d9671bb99ef2fc69f5985 (diff) | |
parent | e4845c627e97a6d125bf33d9e7a4a8d373d7fc4a (diff) | |
download | bsfs-0.23.03.tar.gz bsfs-0.23.03.tar.bz2 bsfs-0.23.03.zip |
Merge branch 'develop'v0.23.03
Diffstat (limited to 'bsfs')
43 files changed, 3567 insertions, 424 deletions
diff --git a/bsfs/__init__.py b/bsfs/__init__.py index 079ffaf..cf08d64 100644 --- a/bsfs/__init__.py +++ b/bsfs/__init__.py @@ -1,9 +1,4 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import collections import typing diff --git a/bsfs/apps/__init__.py b/bsfs/apps/__init__.py index 7efaa87..62dc5b5 100644 --- a/bsfs/apps/__init__.py +++ b/bsfs/apps/__init__.py @@ -1,20 +1,53 @@ -""" +#!/usr/bin/env python3 -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports +import argparse import typing +# bsfs imports +import bsfs + # inner-module imports from .init import main as init from .migrate import main as migrate # exports __all__: typing.Sequence[str] = ( + 'main', 'init', 'migrate', ) +# config +apps = { + 'init' : init, + 'migrate' : migrate, + } + + +## code ## + +def main(argv=None): + """Black Star File System maintenance tools.""" + parser = argparse.ArgumentParser(description=main.__doc__, prog='bsfs') + # version + parser.add_argument('--version', action='version', + version='%(prog)s version {}.{}.{}'.format(*bsfs.version_info)) # pylint: disable=C0209 + # application selection + parser.add_argument('app', choices=apps.keys(), + help='Select the application to run.') + # dangling args + parser.add_argument('rest', nargs=argparse.REMAINDER) + # parse + args = parser.parse_args(argv) + # run application + apps[args.app](args.rest) + + +## main ## + +if __name__ == '__main__': + import sys + main(sys.argv[1:]) + ## EOF ## diff --git a/bsfs/apps/init.py b/bsfs/apps/init.py index 3e2ef37..9afbdd5 100644 --- a/bsfs/apps/init.py +++ b/bsfs/apps/init.py @@ -1,9 +1,5 @@ -""" +#!/usr/bin/env python3 -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import argparse import json @@ -60,9 +56,10 @@ def main(argv): # print config if args.output is not None: with open(args.output, mode='wt', encoding='UTF-8') as ofile: - json.dump(config, ofile) + json.dump(config, ofile, indent=4) else: - json.dump(config, sys.stdout) + json.dump(config, sys.stdout, indent=4) + print('') ## main ## diff --git a/bsfs/apps/migrate.py b/bsfs/apps/migrate.py index 91c1661..34ea2e7 100644 --- a/bsfs/apps/migrate.py +++ b/bsfs/apps/migrate.py @@ -1,9 +1,5 @@ -""" +#!/usr/bin/env python3 -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import argparse import json @@ -42,15 +38,15 @@ def main(argv): graph = bsfs.Open(config) # initialize schema - schema = bsfs.schema.Schema.Empty() + schema = bsfs.schema.Schema() if len(args.schema) == 0: # assemble schema from standard input - schema = schema + bsfs.schema.Schema.from_string(sys.stdin.read()) + schema = schema + bsfs.schema.from_string(sys.stdin.read()) else: # assemble schema from input files for pth in args.schema: with open(pth, mode='rt', encoding='UTF-8') as ifile: - schema = schema + bsfs.schema.Schema.from_string(ifile.read()) + schema = schema + bsfs.schema.from_string(ifile.read()) # migrate schema graph.migrate(schema, not args.remove) diff --git a/bsfs/front/__init__.py b/bsfs/front/__init__.py index 92886ab..cedcd7f 100644 --- a/bsfs/front/__init__.py +++ b/bsfs/front/__init__.py @@ -1,9 +1,4 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import typing diff --git a/bsfs/front/bsfs.py b/bsfs/front/bsfs.py index 968b3f5..f437212 100644 --- a/bsfs/front/bsfs.py +++ b/bsfs/front/bsfs.py @@ -1,9 +1,4 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import typing diff --git a/bsfs/front/builder.py b/bsfs/front/builder.py index 73f1703..b1d488b 100644 --- a/bsfs/front/builder.py +++ b/bsfs/front/builder.py @@ -1,14 +1,9 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import typing # bsfs imports -from bsfs.graph import Graph +from bsfs.graph import Graph, ac from bsfs.triple_store import TripleStoreBase, SparqlStore from bsfs.utils import URI, errors @@ -68,8 +63,10 @@ def build_graph(cfg: typing.Any) -> Graph: if 'backend' not in args: raise errors.ConfigError('required argument "backend" is not provided') backend = build_backend(args['backend']) + # build access controls + access_controls = ac.NullAC(backend, user) # build and return graph cls = _graph_classes[name] - return cls(backend, user) + return cls(backend, access_controls) ## EOF ## diff --git a/bsfs/graph/__init__.py b/bsfs/graph/__init__.py index 82d2235..8d38d23 100644 --- a/bsfs/graph/__init__.py +++ b/bsfs/graph/__init__.py @@ -1,9 +1,4 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import typing diff --git a/bsfs/graph/ac/__init__.py b/bsfs/graph/ac/__init__.py index 420de01..11b45df 100644 --- a/bsfs/graph/ac/__init__.py +++ b/bsfs/graph/ac/__init__.py @@ -1,9 +1,4 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import typing diff --git a/bsfs/graph/ac/base.py b/bsfs/graph/ac/base.py index bc9aeb3..e85c1dd 100644 --- a/bsfs/graph/ac/base.py +++ b/bsfs/graph/ac/base.py @@ -1,17 +1,13 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import abc import typing # bsfs imports from bsfs import schema +from bsfs.query import ast from bsfs.triple_store import TripleStoreBase -from bsfs.utils import URI +from bsfs.utils import URI, typename # exports __all__: typing.Sequence[str] = ( @@ -43,6 +39,20 @@ class AccessControlBase(abc.ABC): self._backend = backend self._user = URI(user) + def __str__(self) -> str: + return f'{typename(self)}({self._user})' + + def __repr__(self) -> str: + return f'{typename(self)}({self._user})' + + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) \ + and self._backend == other._backend \ + and self._user == other._user + + def __hash__(self) -> int: + return hash((type(self), self._backend, self._user)) + @abc.abstractmethod def is_protected_predicate(self, pred: schema.Predicate) -> bool: """Return True if a predicate cannot be modified manually.""" @@ -67,5 +77,16 @@ class AccessControlBase(abc.ABC): def createable(self, node_type: schema.Node, guids: typing.Iterable[URI]) -> typing.Iterable[URI]: """Return nodes that are allowed to be created.""" + @abc.abstractmethod + def filter_read( + self, + node_type: schema.Node, + query: typing.Optional[ast.filter.FilterExpression], + ) -> typing.Optional[ast.filter.FilterExpression]: + """Re-write a filter *query* to get (i.e., read) *node_type* nodes.""" + + @abc.abstractmethod + def fetch_read(self, node_type: schema.Node, query: ast.fetch.FetchExpression) -> ast.fetch.FetchExpression: + """Re-write a fetch *query* to get (i.e, read) values for *node_type* nodes.""" ## EOF ## diff --git a/bsfs/graph/ac/null.py b/bsfs/graph/ac/null.py index 36838bd..c9ec7d0 100644 --- a/bsfs/graph/ac/null.py +++ b/bsfs/graph/ac/null.py @@ -1,15 +1,11 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import typing # bsfs imports from bsfs import schema from bsfs.namespace import ns +from bsfs.query import ast from bsfs.utils import URI # inner-module imports @@ -28,7 +24,7 @@ class NullAC(base.AccessControlBase): def is_protected_predicate(self, pred: schema.Predicate) -> bool: """Return True if a predicate cannot be modified manually.""" - return pred.uri == ns.bsm.t_created + return pred.uri == ns.bsn.t_created def create(self, node_type: schema.Node, guids: typing.Iterable[URI]): """Perform post-creation operations on nodes, e.g. ownership information.""" @@ -49,4 +45,16 @@ class NullAC(base.AccessControlBase): """Return nodes that are allowed to be created.""" return guids + def filter_read( + self, + node_type: schema.Node, + query: typing.Optional[ast.filter.FilterExpression] + ) -> typing.Optional[ast.filter.FilterExpression]: + """Re-write a filter *query* to get (i.e., read) *node_type* nodes.""" + return query + + def fetch_read(self, node_type: schema.Node, query: ast.fetch.FetchExpression) -> ast.fetch.FetchExpression: + """Re-write a fetch *query* to get (i.e, read) values for *node_type* nodes.""" + return query + ## EOF ## diff --git a/bsfs/graph/graph.py b/bsfs/graph/graph.py index b7b9f1c..1b4c212 100644 --- a/bsfs/graph/graph.py +++ b/bsfs/graph/graph.py @@ -1,20 +1,18 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import os import typing # bsfs imports -from bsfs.schema import Schema +from bsfs.query import ast, validate +from bsfs import schema as bsc from bsfs.triple_store import TripleStoreBase from bsfs.utils import URI, typename # inner-module imports +from . import ac from . import nodes as _nodes +from . import resolve # exports __all__: typing.Sequence[str] = ( @@ -25,9 +23,7 @@ __all__: typing.Sequence[str] = ( ## code ## class Graph(): - """The Graph class is - - The Graph class provides a convenient interface to query and access a graph. + """The Graph class provides a convenient interface to query and access a graph. Since it logically builds on the concept of graphs it is easier to navigate than raw triple stores. Naturally, it uses a triple store as *backend*. It also controls actions via access permissions to a *user*. @@ -37,35 +33,40 @@ class Graph(): # link to the triple storage backend. _backend: TripleStoreBase - # user uri. - _user: URI + # access controls. + _ac: ac.AccessControlBase - def __init__(self, backend: TripleStoreBase, user: URI): + def __init__( + self, + backend: TripleStoreBase, + access_control: ac.AccessControlBase, + ): + # store members self._backend = backend - self._user = user + self._ac = access_control # ensure Graph schema requirements self.migrate(self._backend.schema) def __hash__(self) -> int: - return hash((type(self), self._backend, self._user)) + return hash((type(self), self._backend, self._ac)) def __eq__(self, other) -> bool: return isinstance(other, type(self)) \ and self._backend == other._backend \ - and self._user == other._user + and self._ac == other._ac def __repr__(self) -> str: - return f'{typename(self)}(backend={repr(self._backend)}, user={self._user})' + return f'{typename(self)}({repr(self._backend)}, {self._ac})' def __str__(self) -> str: - return f'{typename(self)}({str(self._backend)}, {self._user})' + return f'{typename(self)}({str(self._backend)})' @property - def schema(self) -> Schema: + def schema(self) -> bsc.Schema: """Return the store's local schema.""" return self._backend.schema - def migrate(self, schema: Schema, append: bool = True) -> 'Graph': + def migrate(self, schema: bsc.Schema, append: bool = True) -> 'Graph': """Migrate the current schema to a new *schema*. Appends to the current schema by default; control this via *append*. @@ -73,14 +74,14 @@ class Graph(): """ # check args - if not isinstance(schema, Schema): + if not isinstance(schema, bsc.Schema): raise TypeError(schema) # append to current schema if append: schema = schema + self._backend.schema # add Graph schema requirements with open(os.path.join(os.path.dirname(__file__), 'schema.nt'), mode='rt', encoding='UTF-8') as ifile: - schema = schema + Schema.from_string(ifile.read()) + schema = schema + bsc.from_string(ifile.read()) # migrate schema in backend # FIXME: consult access controls! self._backend.schema = schema @@ -95,19 +96,69 @@ class Graph(): *node_type*) once some data is assigned to them. """ + # get node type type_ = self.schema.node(node_type) # NOTE: Nodes constructor materializes guids. - return _nodes.Nodes(self._backend, self._user, type_, guids) + return _nodes.Nodes(self._backend, self._ac, type_, guids) def node(self, node_type: URI, guid: URI) -> _nodes.Nodes: """Return node *guid* of type *node_type* as a `bsfs.graph.Nodes` instance. - Note that the *guids* need not to exist (however, the *node_type* has + Note that the *guid* need not to exist (however, the *node_type* has to be part of the schema). An inexistent guid will be created (using *node_type*) once some data is assigned to them. """ + return self.nodes(node_type, {guid}) + + def empty(self, node_type: URI) -> _nodes.Nodes: + """Return a `Nodes` instance with type *node_type* but no nodes.""" + return self.nodes(node_type, set()) + + def get( + self, + node_type: URI, + query: typing.Optional[ast.filter.FilterExpression], + ) -> _nodes.Nodes: + """Return a `Nodes` instance over all nodes of type *node_type* that match the *query*.""" + # return Nodes instance + type_ = self.schema.node(node_type) + return _nodes.Nodes(self._backend, self._ac, type_, self.__get(node_type, query)) + + def sorted( + self, + node_type: URI, + query: typing.Optional[ast.filter.FilterExpression], + # FIXME: sort ast + ) -> typing.Iterator[_nodes.Nodes]: + """Return a iterator over `Nodes` instances over all nodes of type *node_type* that match the *query*.""" + # FIXME: Order should be a parameter + # return iterator over Nodes instances + type_ = self.schema.node(node_type) + for guid in self.__get(node_type, query): + yield _nodes.Nodes(self._backend, self._ac, type_, {guid}) + + def all(self, node_type: URI) -> _nodes.Nodes: + """Return all instances of type *node_type*.""" + type_ = self.schema.node(node_type) + return _nodes.Nodes(self._backend, self._ac, type_, self.__get(node_type, None)) + + def __get( + self, + node_type: URI, + query: typing.Optional[ast.filter.FilterExpression], + ) -> typing.Iterator[URI]: + """Build and execute a get query.""" + # get node type type_ = self.schema.node(node_type) - return _nodes.Nodes(self._backend, self._user, type_, {guid}) + # resolve Nodes instances + query = resolve.Filter(self._backend.schema).resolve(type_, query) + # add access controls to query + query = self._ac.filter_read(type_, query) + # validate query + if query is not None: + validate.Filter(self._backend.schema).validate(type_, query) + # query the backend and return the (non-materialized) result + return self._backend.get(type_, query) ## EOF ## diff --git a/bsfs/graph/nodes.py b/bsfs/graph/nodes.py index c417a0e..47b0217 100644 --- a/bsfs/graph/nodes.py +++ b/bsfs/graph/nodes.py @@ -1,21 +1,20 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports +from collections import abc import time import typing # bsfs imports -from bsfs import schema as _schema +from bsfs import schema as bsc from bsfs.namespace import ns +from bsfs.query import ast, validate from bsfs.triple_store import TripleStoreBase from bsfs.utils import errors, URI, typename # inner-module imports from . import ac +from . import result +from . import walk # exports __all__: typing.Sequence[str] = ( @@ -26,18 +25,20 @@ __all__: typing.Sequence[str] = ( ## code ## class Nodes(): - """ + """Container for graph nodes, provides operations on nodes. + + NOTE: Should not be created directly but only via `bsfs.graph.Graph`. NOTE: guids may or may not exist. This is not verified as nodes are created on demand. """ # triple store backend. _backend: TripleStoreBase - # user uri. - _user: URI + # access controls. + _ac: ac.AccessControlBase # node type. - _node_type: _schema.Node + _node_type: bsc.Node # guids of nodes. Can be empty. _guids: typing.Set[URI] @@ -45,34 +46,35 @@ class Nodes(): def __init__( self, backend: TripleStoreBase, - user: URI, - node_type: _schema.Node, + access_control: ac.AccessControlBase, + node_type: bsc.Node, guids: typing.Iterable[URI], ): + # set main members self._backend = backend - self._user = user + self._ac = access_control self._node_type = node_type - self._guids = set(guids) - self.__ac = ac.NullAC(self._backend, self._user) + # convert to URI since this is not guaranteed by Graph + self._guids = {URI(guid) for guid in guids} def __eq__(self, other: typing.Any) -> bool: return isinstance(other, Nodes) \ and self._backend == other._backend \ - and self._user == other._user \ + and self._ac == other._ac \ and self._node_type == other._node_type \ and self._guids == other._guids def __hash__(self) -> int: - return hash((type(self), self._backend, self._user, self._node_type, tuple(sorted(self._guids)))) + return hash((type(self), self._backend, self._ac, self._node_type, tuple(sorted(self._guids)))) def __repr__(self) -> str: - return f'{typename(self)}({self._backend}, {self._user}, {self._node_type}, {self._guids})' + return f'{typename(self)}({self._backend}, {self._ac}, {self._node_type}, {self._guids})' def __str__(self) -> str: return f'{typename(self)}({self._node_type}, {self._guids})' @property - def node_type(self) -> _schema.Node: + def node_type(self) -> bsc.Node: """Return the node's type.""" return self._node_type @@ -81,9 +83,72 @@ class Nodes(): """Return all node guids.""" return iter(self._guids) + @property + def schema(self) -> bsc.Schema: + """Return the store's local schema.""" + return self._backend.schema + + def __add__(self, other: typing.Any) -> 'Nodes': + """Concatenate guids. Backend, AC, and node type must match.""" + if not isinstance(other, type(self)): + return NotImplemented + if self._backend != other._backend: + raise ValueError(other) + if self._ac != other._ac: + raise ValueError(other) + if self.node_type != other.node_type: + raise ValueError(other) + return Nodes(self._backend, self._ac, self.node_type, self._guids | other._guids) + + def __or__(self, other: typing.Any) -> 'Nodes': + """Concatenate guids. Backend, AC, and node type must match.""" + return self.__add__(other) + + def __sub__(self, other: typing.Any) -> 'Nodes': + """Subtract guids. Backend, AC, and node type must match.""" + if not isinstance(other, type(self)): + return NotImplemented + if self._backend != other._backend: + raise ValueError(other) + if self._ac != other._ac: + raise ValueError(other) + if self.node_type != other.node_type: + raise ValueError(other) + return Nodes(self._backend, self._ac, self.node_type, self._guids - other._guids) + + def __and__(self, other: typing.Any) -> 'Nodes': + """Intersect guids. Backend, AC, and node type must match.""" + if not isinstance(other, type(self)): + return NotImplemented + if self._backend != other._backend: + raise ValueError(other) + if self._ac != other._ac: + raise ValueError(other) + if self.node_type != other.node_type: + raise ValueError(other) + return Nodes(self._backend, self._ac, self.node_type, self._guids & other._guids) + + def __len__(self) -> int: + """Return the number of guids.""" + return len(self._guids) + + def __iter__(self) -> typing.Iterator['Nodes']: + """Iterate over individual guids. Returns `Nodes` instances.""" + return iter( + Nodes(self._backend, self._ac, self.node_type, {guid}) + for guid in self._guids + ) + + def __getattr__(self, name: str): + try: + return super().__getattr__(name) # type: ignore [misc] # parent has no getattr + except AttributeError: + pass + return walk.Walk(self, walk.Walk.step(self.schema, self.node_type, name)) + def set( self, - pred: URI, # FIXME: URI or _schema.Predicate? + pred: URI, # FIXME: URI or bsc.Predicate? value: typing.Any, ) -> 'Nodes': """Set predicate *pred* to *value*.""" @@ -91,7 +156,7 @@ class Nodes(): def set_from_iterable( self, - predicate_values: typing.Iterable[typing.Tuple[URI, typing.Any]], # FIXME: URI or _schema.Predicate? + predicate_values: typing.Iterable[typing.Tuple[URI, typing.Any]], # FIXME: URI or bsc.Predicate? ) -> 'Nodes': """Set mutliple predicate-value pairs at once.""" # TODO: Could group predicate_values by predicate to gain some efficiency @@ -105,7 +170,7 @@ class Nodes(): self._backend.commit() except ( - errors.PermissionDeniedError, # tried to set a protected predicate (ns.bsm.t_created) + errors.PermissionDeniedError, # tried to set a protected predicate errors.ConsistencyError, # node types are not in the schema or don't match the predicate errors.InstanceError, # guids/values don't have the correct type TypeError, # value is supposed to be a Nodes instance @@ -120,6 +185,126 @@ class Nodes(): return self + def get( + self, + *paths: typing.Union[URI, typing.Iterable[URI]], + view: typing.Union[typing.Type[list], typing.Type[dict]] = dict, + **view_kwargs, + ) -> typing.Any: + """Get values or nodes at *paths*. + Return an iterator (view=list) or a dict (view=dict) over the results. + """ + # FIXME: user-provided Fetch query AST? + # check args + if len(paths) == 0: + raise AttributeError('expected at least one path, found none') + if view not in (dict, list): + raise ValueError(f'expected dict or list, found {view}') + # process paths: create fetch ast, build name mapping, and find unique paths + schema = self.schema + statements = set() + name2path = {} + unique_paths = set() # paths that result in a single (unique) value + normpath: typing.Tuple[URI, ...] + for idx, path in enumerate(paths): + # normalize path + if isinstance(path, str): + normpath = (URI(path), ) + elif isinstance(path, abc.Iterable): + if not all(isinstance(step, str) for step in path): + raise TypeError(path) + normpath = tuple(URI(step) for step in path) + else: + raise TypeError(path) + # check path's schema consistency + if not all(schema.has_predicate(pred) for pred in normpath): + raise errors.ConsistencyError(f'path is not fully covered by the schema: {path}') + # check path's uniqueness + if all(schema.predicate(pred).unique for pred in normpath): + unique_paths.add(path) + # fetch tail predicate + tail = schema.predicate(normpath[-1]) + # determine tail ast node type + factory = ast.fetch.Node if isinstance(tail.range, bsc.Node) else ast.fetch.Value + # assign name + name = f'fetch{idx}' + name2path[name] = (path, tail) + # create tail ast node + curr: ast.fetch.FetchExpression = factory(tail.uri, name) + # walk towards front + hop: URI + for hop in normpath[-2::-1]: + curr = ast.fetch.Fetch(hop, curr) + # add to fetch query + statements.add(curr) + # aggregate fetch statements + if len(statements) == 1: + fetch = next(iter(statements)) + else: + fetch = ast.fetch.All(*statements) + # add access controls to fetch + fetch = self._ac.fetch_read(self.node_type, fetch) + + if len(self._guids) == 0: + # shortcut: no need to query; no triples + # FIXME: if the Fetch query can given by the user, we might want to check its validity + def triple_iter(): + return [] + else: + # compose filter ast + filter = ast.filter.IsIn(self.guids) # pylint: disable=redefined-builtin + # add access controls to filter + filter = self._ac.filter_read(self.node_type, filter) # type: ignore [assignment] + + # validate queries + validate.Filter(self._backend.schema).validate(self.node_type, filter) + validate.Fetch(self._backend.schema).validate(self.node_type, fetch) + + # process results, convert if need be + def triple_iter(): + # query the backend + triples = self._backend.fetch(self.node_type, filter, fetch) + # process triples + for root, name, raw in triples: + # get node + node = Nodes(self._backend, self._ac, self.node_type, {root}) + # get path + path, tail = name2path[name] + # covert raw to value + if isinstance(tail.range, bsc.Node): + value = Nodes(self._backend, self._ac, tail.range, {raw}) + else: + value = raw + # emit triple + yield node, path, value + + # simplify by default + view_kwargs['node'] = view_kwargs.get('node', len(self._guids) != 1) + view_kwargs['path'] = view_kwargs.get('path', len(paths) != 1) + view_kwargs['value'] = view_kwargs.get('value', False) + + # return results view + if view == list: + return result.to_list_view( + triple_iter(), + # aggregation args + **view_kwargs, + ) + + if view == dict: + return result.to_dict_view( + triple_iter(), + # context + len(self._guids) == 1, + len(paths) == 1, + unique_paths, + # aggregation args + **view_kwargs, + ) + + raise errors.UnreachableError() # view was already checked + + def __set(self, predicate: URI, value: typing.Any): """ """ @@ -135,7 +320,7 @@ class Nodes(): # FIXME: Needed? Could be integrated into other AC methods (by passing the predicate!) # This could allow more fine-grained predicate control (e.g. based on ownership) # rather than a global approach like this. - if self.__ac.is_protected_predicate(pred): + if self._ac.is_protected_predicate(pred): raise errors.PermissionDeniedError(pred) # set operation affects all nodes (if possible) @@ -145,11 +330,11 @@ class Nodes(): guids = set(self._ensure_nodes(node_type, guids)) # check value - if isinstance(pred.range, _schema.Literal): + if isinstance(pred.range, bsc.Literal): # check write permissions on existing nodes # As long as the user has write permissions, we don't restrict # the creation or modification of literal values. - guids = set(self.__ac.write_literal(node_type, guids)) + guids = set(self._ac.write_literal(node_type, guids)) # insert literals # TODO: Support passing iterators as values for non-unique predicates @@ -160,8 +345,9 @@ class Nodes(): [value], ) - elif isinstance(pred.range, _schema.Node): + elif isinstance(pred.range, bsc.Node): # check value type + # FIXME: value could be a set of Nodes if not isinstance(value, Nodes): raise TypeError(value) # value's node_type must be a subclass of the predicate's range @@ -172,14 +358,14 @@ class Nodes(): # Link permissions cover adding and removing links on the source node. # Specifically, link permissions also allow to remove links to other # nodes if needed (e.g. for unique predicates). - guids = set(self.__ac.link_from_node(node_type, guids)) + guids = set(self._ac.link_from_node(node_type, guids)) # get link targets targets = set(value.guids) # ensure existence of value nodes; create nodes if need be targets = set(self._ensure_nodes(value.node_type, targets)) # check link permissions on target nodes - targets = set(self.__ac.link_to_node(value.node_type, targets)) + targets = set(self._ac.link_to_node(value.node_type, targets)) # insert node links self._backend.set( @@ -192,7 +378,7 @@ class Nodes(): else: raise errors.UnreachableError() - def _ensure_nodes(self, node_type: _schema.Node, guids: typing.Iterable[URI]): + def _ensure_nodes(self, node_type: bsc.Node, guids: typing.Iterable[URI]): """ """ # check node existence @@ -203,14 +389,14 @@ class Nodes(): # create nodes if need be if len(missing) > 0: # check which missing nodes can be created - missing = set(self.__ac.createable(node_type, missing)) + missing = set(self._ac.createable(node_type, missing)) # create nodes self._backend.create(node_type, missing) # add bookkeeping triples self._backend.set(node_type, missing, - self._backend.schema.predicate(ns.bsm.t_created), [time.time()]) + self._backend.schema.predicate(ns.bsn.t_created), [time.time()]) # add permission triples - self.__ac.create(node_type, missing) + self._ac.create(node_type, missing) # return available nodes return existing | missing diff --git a/bsfs/graph/resolve.py b/bsfs/graph/resolve.py new file mode 100644 index 0000000..a58eb67 --- /dev/null +++ b/bsfs/graph/resolve.py @@ -0,0 +1,174 @@ + +# imports +import typing + +# bsfs imports +from bsfs import schema as bsc +from bsfs.query import ast +from bsfs.utils import errors + +# inner-module imports +from . import nodes + +# exports +__all__: typing.Sequence[str] = ( + 'Filter', + ) + + +## code ## + +class Filter(): + """Rewrites the query to replace `bsfs.graph.nodes.Nodes` instances with the respective URI. + Does only limited type checking and schema validation. + Use `bsfs.schema.validate.Filter` to do so. + + Example: + input: Any(ns.bse.tag, Is(Nodes(...))) + output: Any(ns.bse.tag, Or(Is(...), Is(...), ...))) + + >>> tags = graph.node(ns.bsn.Tag, 'http://example.com/me/tag#1234') + >>> graph.get(ns.bsn.Entity, ast.filter.Any(ns.bse.tag, ast.filter.Is(tags))) + + """ + + def __init__(self, schema): + self.schema = schema + + def __call__( + self, + root_type: bsc.Node, + node: typing.Optional[ast.filter.FilterExpression], + ): + """Alias for `Resolve.resolve`.""" + return self.resolve(root_type, node) + + def resolve( + self, + root_type: bsc.Node, + node: typing.Optional[ast.filter.FilterExpression], + ): + """Resolve Nodes instances of a *node* query starting at *root_type*.""" + if node is None: + return None + return self._parse_filter_expression(root_type, node) + + def _parse_filter_expression( + self, + type_: bsc.Vertex, + node: ast.filter.FilterExpression, + ) -> ast.filter.FilterExpression: + """Route *node* to the handler of the respective FilterExpression subclass.""" + if isinstance(node, ast.filter.Is): + return self._is(type_, node) + if isinstance(node, ast.filter.Not): + return self._not(type_, node) + if isinstance(node, ast.filter.Has): + return self._has(type_, node) + if isinstance(node, ast.filter.Any): + return self._any(type_, node) + if isinstance(node, ast.filter.All): + return self._all(type_, node) + if isinstance(node, ast.filter.And): + return self._and(type_, node) + if isinstance(node, ast.filter.Or): + return self._or(type_, node) + if isinstance(node, ast.filter.Distance): + return self._distance(type_, node) + if isinstance(node, (ast.filter.Equals, ast.filter.Substring, \ + ast.filter.StartsWith, ast.filter.EndsWith)): + return self._value(type_, node) + if isinstance(node, (ast.filter.LessThan, ast.filter.GreaterThan)): + return self._bounded(type_, node) + # invalid node + raise errors.BackendError(f'expected filter expression, found {node}') + + def _parse_predicate_expression(self, node: ast.filter.PredicateExpression) -> bsc.Vertex: + """Route *node* to the handler of the respective PredicateExpression subclass.""" + if isinstance(node, ast.filter.Predicate): + return self._predicate(node) + if isinstance(node, ast.filter.OneOf): + return self._one_of(node) + # invalid node + raise errors.BackendError(f'expected predicate expression, found {node}') + + def _predicate(self, node: ast.filter.Predicate) -> bsc.Vertex: + if not self.schema.has_predicate(node.predicate): + raise errors.ConsistencyError(f'predicate {node.predicate} is not in the schema') + pred = self.schema.predicate(node.predicate) + dom, rng = pred.domain, pred.range + if node.reverse: + dom, rng = rng, dom + return rng + + def _one_of(self, node: ast.filter.OneOf) -> bsc.Vertex: + # determine domain and range types + rng = None + for pred in node: + # parse child expression + subrng = self._parse_predicate_expression(pred) + # determine the next type + if rng is None or subrng > rng: # pick most generic range + rng = subrng + # check range consistency + if not subrng <= rng and not subrng >= rng: + raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related') + if not isinstance(rng, (bsc.Node, bsc.Literal)): + raise errors.BackendError(f'the range of node {node} is undefined') + return rng + + def _any(self, type_: bsc.Vertex, node: ast.filter.Any) -> ast.filter.Any: # pylint: disable=unused-argument + next_type = self._parse_predicate_expression(node.predicate) + return ast.filter.Any(node.predicate, self._parse_filter_expression(next_type, node.expr)) + + def _all(self, type_: bsc.Vertex, node: ast.filter.All) -> ast.filter.All: # pylint: disable=unused-argument + next_type = self._parse_predicate_expression(node.predicate) + return ast.filter.All(node.predicate, self._parse_filter_expression(next_type, node.expr)) + + def _and(self, type_: bsc.Vertex, node: ast.filter.And) -> ast.filter.And: + return ast.filter.And({self._parse_filter_expression(type_, expr) for expr in node}) + + def _or(self, type_: bsc.Vertex, node: ast.filter.Or) -> ast.filter.Or: + return ast.filter.Or({self._parse_filter_expression(type_, expr) for expr in node}) + + def _not(self, type_: bsc.Vertex, node: ast.filter.Not) -> ast.filter.Not: + return ast.filter.Not(self._parse_filter_expression(type_, node.expr)) + + def _has(self, type_: bsc.Vertex, node: ast.filter.Has) -> ast.filter.Has: # pylint: disable=unused-argument + return node + + def _distance(self, type_: bsc.Vertex, node: ast.filter.Distance): # pylint: disable=unused-argument + return node + + def _value(self, type_: bsc.Vertex, node: ast.filter._Value) -> ast.filter._Value: # pylint: disable=unused-argument + return node + + def _bounded(self, type_: bsc.Vertex, node: ast.filter._Bounded) -> ast.filter._Bounded: # pylint: disable=unused-argument + return node + + def _is(self, type_: bsc.Vertex, node: ast.filter.Is) -> typing.Union[ast.filter.Or, ast.filter.Is]: + # check if action is needed + if not isinstance(node.value, nodes.Nodes): + return node + # check schema consistency + if node.value.node_type not in self.schema.nodes(): + raise errors.ConsistencyError(f'node {node.value.node_type} is not in the schema') + # check type compatibility + if not isinstance(type_, bsc.Node): + raise errors.ConsistencyError(f'expected a node, found {type_}') + if not node.value.node_type <= type_: + raise errors.ConsistencyError(f'expected type {type_} or subtype thereof, found {node.value.node_type}') + # NOTE: We assume that the node type is checked when writing to the backend. + # Links to any of the guids can therefore only exist if the type matches. + # Hence, we don't add a type check/constrain here. + return ast.filter.Or(ast.filter.Is(guid) for guid in node.value.guids) + # optimized code, removing unnecessary ast.filter.Or + #guids = set(node.value.guids) + #if len(guids) == 0: + # raise errors.BackendError(f'') + #if len(guids) == 1: + # return ast.filter.Nodeid(next(iter(guids))) + #return ast.filter.Or(ast.filter.Is(guid) for guid in guids) + + +## EOF ## diff --git a/bsfs/graph/result.py b/bsfs/graph/result.py new file mode 100644 index 0000000..0fcbb13 --- /dev/null +++ b/bsfs/graph/result.py @@ -0,0 +1,119 @@ + +# imports +from collections import defaultdict +import typing + +# bsfs imports +from bsfs.utils import URI + +# exports +__all__: typing.Sequence[str] = ( + 'to_list_view', + 'to_dict_view', + ) + + +## code ## + +# FIXME: node, path, value seem counter-intuitive: +# node.get(..., node=True) removes the node part. +# wouldn't it make more sense if node=True keeps the node part +# and node=False drops it? + +def to_list_view( + triples, + # aggregators + node: bool, + path: bool, + value: bool, # pylint: disable=unused-argument + ): + """Return an iterator over results. + + Dependent on the *node*, *path*, and *value* flags, + the respective component is omitted. + + """ + if not node and not path: + return iter(val for _, _, val in triples) + if not node: + return iter((pred, val) for _, pred, val in triples) + if not path: + return iter((subj, val) for subj, _, val in triples) + return iter((subj, pred, val) for subj, pred, val in triples) + + +def to_dict_view( + triples, + # context + one_node: bool, + one_path: bool, + unique_paths: typing.Set[typing.Union[URI, typing.Iterable[URI]]], + # aggregators + node: bool, + path: bool, + value: bool, + default: typing.Optional[typing.Any] = None, + ) -> typing.Any: + """Return a dict of results. + + Note that triples are materialized to create this view. + + The returned structure depends on the *node*, *path*, and *value* flags. + If all flags are set to False, returns a dict(node -> dict(path -> set(values))). + Setting a flag to true omits or simplifies the respective component (if possible). + + """ + # NOTE: To create a dict, we need to materialize or make further assumptions + # (e.g., sorted in a specific order). + + data: typing.Any # disable type checks on data since it's very flexibly typed. + + # FIXME: type of data can be overwritten later on (if value) + + if not node and not path: + data = set() + elif node ^ path: + data = defaultdict(set) + else: + data = defaultdict(lambda: defaultdict(set)) + + for subj, pred, val in triples: + unique = pred in unique_paths + if not node and not path: + if not value and unique and one_node and one_path: + return val + data.add(val) + elif not node: + # remove node from result, group by predicate + if not value and unique and one_node: + data[pred] = val + else: + data[pred].add(val) + elif not path: + # remove predicate from result, group by node + if not value and unique and one_path: + data[subj] = val + else: + data[subj].add(val) + else: + if not value and unique: + data[subj][pred] = val + else: + data[subj][pred].add(val) + + # FIXME: Combine multiple Nodes instances into one? + + # convert defaultdict to ordinary dict + # pylint: disable=too-many-boolean-expressions + if not node and not path and not value \ + and len(unique_paths) > 0 and one_node and one_path \ + and len(data) == 0: + return default + # pylint: enable=too-many-boolean-expressions + if not node and not path: + return data + if node ^ path: + return dict(data) + return {key: dict(val) for key, val in data.items()} + +## EOF ## diff --git a/bsfs/graph/schema.nt b/bsfs/graph/schema.nt index 8612681..37bba5e 100644 --- a/bsfs/graph/schema.nt +++ b/bsfs/graph/schema.nt @@ -4,15 +4,17 @@ prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> prefix xsd: <http://www.w3.org/2001/XMLSchema#> # bsfs prefixes -prefix bsfs: <http://bsfs.ai/schema/> -prefix bsm: <http://bsfs.ai/schema/Meta#> +prefix bsfs: <https://schema.bsfs.io/core/> +prefix bsl: <https://schema.bsfs.io/core/Literal/> +prefix bsn: <https://schema.bsfs.io/core/Node#> # literals -xsd:integer rdfs:subClassOf bsfs:Literal . +bsl:Number rdfs:subClassOf bsfs:Literal . +xsd:float rdfs:subClassOf bsl:Number . # predicates -bsm:t_created rdfs:subClassOf bsfs:Predicate ; +bsn:t_created rdfs:subClassOf bsfs:Predicate ; rdfs:domain bsfs:Node ; - rdfs:range xsd:integer ; + rdfs:range xsd:float ; bsfs:unique "true"^^xsd:boolean . diff --git a/bsfs/graph/walk.py b/bsfs/graph/walk.py new file mode 100644 index 0000000..6415c9b --- /dev/null +++ b/bsfs/graph/walk.py @@ -0,0 +1,115 @@ + +# imports +from collections import abc +import typing + +# bsfs imports +from bsfs import schema as bsc + +# inner-module imports +# NOTE: circular import! OK as long as only used for type annotations. +from . import nodes # pylint: disable=cyclic-import + +# exports +__all__: typing.Sequence[str] = ( + 'Walk', + ) + + +## code ## + +class Walk(abc.Hashable, abc.Callable): # type: ignore [misc] # invalid base class (Callable) + """Syntactic sugar for `Nodes` to build and act on predicate paths via members.""" + + # Link to Nodes instance. + _root: 'nodes.Nodes' + + # Current predicate path. + _path: typing.Tuple[bsc.Predicate, ...] + + def __init__( + self, + root: 'nodes.Nodes', + path: typing.Sequence[bsc.Predicate], + ): + self._root = root + self._path = tuple(path) + + @property + def tail(self): + """Return the node type at the end of the path.""" + return self._path[-1].range + + + ## comparison + + def __hash__(self) -> int: + """Return an integer hash that identifies the instance.""" + return hash((type(self), self._root, self._path)) + + def __eq__(self, other) -> bool: + """Compare against *other* backend.""" + return isinstance(other, type(self)) \ + and self._root == other._root \ + and self._path == other._path + + + ## representation + + def __repr__(self) -> str: + """Return a formal string representation.""" + path = ', '.join(pred.uri for pred in self._path) + return f'Walk({self._root.node_type.uri}, ({path}))' + + def __str__(self) -> str: + """Return an informal string representation.""" + path = ', '.join(pred.uri for pred in self._path) + return f'Walk(@{self._root.node_type.uri}: {path})' + + + ## walk + + @staticmethod + def step( + schema: bsc.Schema, + node: bsc.Node, + name: str, + ) -> typing.Tuple[bsc.Predicate]: + """Get an predicate at *node* whose fragment matches *name*.""" + predicates = tuple( + pred + for pred + in schema.predicates_at(node) + if pred.uri.get('fragment', None) == name + ) + if len(predicates) == 0: # no fragment found for name + raise ValueError(f'no available predicate matches {name}') # FIXME: Custom exception + if len(predicates) > 1: # ambiguous name + raise ValueError(f'{name} matches multiple predicates') # FIXME: Custom exception + # append predicate to walk + return predicates # type: ignore [return-value] # size is one + + def __getattr__(self, name: str) -> 'Walk': + """Alias for `Walk.step(name)`.""" + try: + return super().__getattr__(name) + except AttributeError: + pass + # get predicate + pred = self.step(self._root.schema, self.tail, name) + # append predicate to walk + return Walk(self._root, self._path + pred) + + + ## get paths ## + + def get(self, **kwargs) -> typing.Any: + """Alias for `Nodes.get(..)`.""" + return self._root.get(tuple(pred.uri for pred in self._path), **kwargs) + + def __call__(self, **kwargs) -> typing.Any: # pylint: disable=arguments-differ + """Alias for `Walk.get(...)`.""" + return self.get(**kwargs) + + +## EOF ## diff --git a/bsfs/namespace/__init__.py b/bsfs/namespace/__init__.py index 98d472f..76f39a2 100644 --- a/bsfs/namespace/__init__.py +++ b/bsfs/namespace/__init__.py @@ -1,19 +1,13 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import typing # inner-module imports from . import predefined as ns -from .namespace import ClosedNamespace, Namespace +from .namespace import Namespace # exports __all__: typing.Sequence[str] = ( - 'ClosedNamespace', 'Namespace', 'ns', ) diff --git a/bsfs/namespace/namespace.py b/bsfs/namespace/namespace.py index f652dcd..b388f53 100644 --- a/bsfs/namespace/namespace.py +++ b/bsfs/namespace/namespace.py @@ -1,104 +1,54 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import typing # bsfs imports -from bsfs.utils import URI, typename +from bsfs.utils import URI # exports __all__: typing.Sequence[str] = ( - 'ClosedNamespace', 'Namespace', + 'FinalNamespace', ) ## code ## -class Namespace(): - """A namespace consists of a common prefix that is used in a set of URIs. - Note that the prefix must include the separator between - path and fragment (typically a '#' or a '/'). - """ - - # namespace prefix. - prefix: URI - - # fragment separator. - fsep: str - - # path separator. - psep: str - - def __init__(self, prefix: URI, fsep: str = '#', psep: str = '/'): - # ensure prefix type - prefix = URI(prefix) - # truncate fragment separator - while prefix.endswith(fsep): - prefix = URI(prefix[:-1]) - # truncate path separator - while prefix.endswith(psep): - prefix = URI(prefix[:-1]) - # store members - self.prefix = prefix - self.fsep = fsep - self.psep = psep - - def __eq__(self, other: typing.Any) -> bool: - return isinstance(other, type(self)) \ - and self.prefix == other.prefix \ - and self.fsep == other.fsep \ - and self.psep == other.psep +class Namespace(URI): + """The Namespace allows you to incrementally append path segments to an URI. - def __hash__(self) -> int: - return hash((type(self), self.prefix, self.fsep, self.psep)) + Segments are separated by `Namespace.sep` ('/'). + The `__call__` method signals that the URI is complete until the query part. - def __str__(self) -> str: - return f'{typename(self)}({self.prefix})' - - def __repr__(self) -> str: - return f'{typename(self)}({self.prefix}, {self.fsep}, {self.psep})' - - def __getattr__(self, fragment: str) -> URI: - """Return prefix + fragment.""" - return URI(self.prefix + self.fsep + fragment) - - def __getitem__(self, fragment: str) -> URI: - """Alias for getattr(self, fragment).""" - return self.__getattr__(fragment) + """ - def __add__(self, value: typing.Any) -> 'Namespace': - """Concatenate another namespace to this one.""" - if not isinstance(value, str): - return NotImplemented - return Namespace(self.prefix + self.psep + value, self.fsep, self.psep) + # path separator + sep: str = '/' + def __getattr__(self, query: str) -> 'Namespace': + """Append the *query* to the current value and return as Namespace.""" + return Namespace(self + self.sep + query) -class ClosedNamespace(Namespace): - """Namespace that covers a restricted set of URIs.""" + def __call__(self, sep: str = '#') -> 'FinalNamespace': + """Finalize the namespace.""" + return FinalNamespace(self, sep) - # set of permissible fragments. - fragments: typing.Set[str] - def __init__(self, prefix: URI, *args: str, fsep: str = '#', psep: str = '/'): - super().__init__(prefix, fsep, psep) - self.fragments = set(args) +# FIXME: Integrate FinalNamespace into Namespace? Do we need to have both? +class FinalNamespace(URI): + """The FinalNamespace allows you to append a fragment to an URI.""" - def __eq__(self, other: typing.Any) -> bool: - return super().__eq__(other) and self.fragments == other.fragments + # fragment separator + sep: str - def __hash__(self) -> int: - return hash((type(self), self.prefix, tuple(sorted(self.fragments)))) + def __new__(cls, value: str, sep: str = '#'): + inst = URI.__new__(cls, value) + inst.sep = sep + return inst def __getattr__(self, fragment: str) -> URI: - """Return prefix + fragment or raise a KeyError if the fragment is not part of this namespace.""" - if fragment not in self.fragments: - raise KeyError(f'{fragment} is not a valid fragment of namespace {self.prefix}') - return super().__getattr__(fragment) + """Append the *fragment* to the current value and return as URI.""" + return URI(self + self.sep + fragment) ## EOF ## diff --git a/bsfs/namespace/predefined.py b/bsfs/namespace/predefined.py index cd48a46..8b60d39 100644 --- a/bsfs/namespace/predefined.py +++ b/bsfs/namespace/predefined.py @@ -1,35 +1,29 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import typing -# bsfs imports -from bsfs.utils import URI - # inner-module imports -from . import namespace +from .namespace import Namespace, FinalNamespace # essential bsfs namespaces -bsfs: namespace.Namespace = namespace.Namespace(URI('http://bsfs.ai/schema'), fsep='/') - +bsfs = Namespace('https://schema.bsfs.io/core') # additional bsfs namespaces -bse: namespace.Namespace = namespace.Namespace(URI('http://bsfs.ai/schema/Entity')) -bsm: namespace.Namespace = namespace.Namespace(URI('http://bsfs.ai/schema/Meta')) +bsd = bsfs.distance() +bsl = bsfs.Literal +bsn = bsfs.Node() # generic namespaces -rdf: namespace.Namespace = namespace.Namespace(URI('http://www.w3.org/1999/02/22-rdf-syntax-ns')) -rdfs: namespace.Namespace = namespace.Namespace(URI('http://www.w3.org/2000/01/rdf-schema')) -schema: namespace.Namespace = namespace.Namespace(URI('http://schema.org'), fsep='/') -xsd: namespace.Namespace = namespace.Namespace(URI('http://www.w3.org/2001/XMLSchema')) +rdf = FinalNamespace('http://www.w3.org/1999/02/22-rdf-syntax-ns') +rdfs = FinalNamespace('http://www.w3.org/2000/01/rdf-schema') +xsd = FinalNamespace('http://www.w3.org/2001/XMLSchema') +schema = FinalNamespace('http://schema.org', sep='/') +# exports __all__: typing.Sequence[str] = ( - 'bse', + 'bsd', 'bsfs', - 'bsm', + 'bsl', + 'bsn', 'rdf', 'rdfs', 'schema', diff --git a/bsfs/query/__init__.py b/bsfs/query/__init__.py new file mode 100644 index 0000000..58ff03a --- /dev/null +++ b/bsfs/query/__init__.py @@ -0,0 +1,15 @@ + +# imports +import typing + +# inner-module imports +from . import ast +from . import validator as validate + +# exports +__all__: typing.Sequence[str] = ( + 'ast', + 'validate', + ) + +## EOF ## diff --git a/bsfs/query/ast/__init__.py b/bsfs/query/ast/__init__.py new file mode 100644 index 0000000..bceaac0 --- /dev/null +++ b/bsfs/query/ast/__init__.py @@ -0,0 +1,23 @@ +"""Query AST components. + +The query AST consists of a Filter and a Fetch syntax trees. + +Classes beginning with an underscore (_) represent internal type hierarchies +and should not be used for parsing. Note that the AST structures do not +(and cannot) check semantic validity or consistency with a given schema. + +""" +# imports +import typing + +# inner-module imports +from . import fetch +from . import filter_ as filter # pylint: disable=redefined-builtin + +# exports +__all__: typing.Sequence[str] = ( + 'fetch', + 'filter', + ) + +## EOF ## diff --git a/bsfs/query/ast/fetch.py b/bsfs/query/ast/fetch.py new file mode 100644 index 0000000..66d94e1 --- /dev/null +++ b/bsfs/query/ast/fetch.py @@ -0,0 +1,169 @@ + +# imports +from collections import abc +import typing + +# bsfs imports +from bsfs.utils import URI, typename, normalize_args + +# exports +__all__ : typing.Sequence[str] = ( + 'All', + 'Fetch', + 'FetchExpression', + 'Node', + 'This', + 'Value', + ) + + +## code ## + +class FetchExpression(abc.Hashable): + """Generic Fetch expression.""" + + def __repr__(self) -> str: + """Return the expressions's string representation.""" + return f'{typename(self)}()' + + def __hash__(self) -> int: + """Return the expression's integer representation.""" + return hash(type(self)) + + def __eq__(self, other: typing.Any) -> bool: + """Return True if *self* and *other* are equivalent.""" + return isinstance(other, type(self)) + + +class All(FetchExpression): + """Fetch all child expressions.""" + + # child expressions. + expr: typing.Set[FetchExpression] + + def __init__(self, *expr): + # unpack child expressions + unfolded = set(normalize_args(*expr)) + # check child expressions + if len(unfolded) == 0: + raise AttributeError('expected at least one expression, found none') + if not all(isinstance(itm, FetchExpression) for itm in unfolded): + raise TypeError(expr) + # initialize + super().__init__() + # assign members + self.expr = unfolded + + def __iter__(self) -> typing.Iterator[FetchExpression]: + return iter(self.expr) + + def __len__(self) -> int: + return len(self.expr) + + def __repr__(self) -> str: + return f'{typename(self)}({self.expr})' + + def __hash__(self) -> int: + return hash((super().__hash__(), tuple(sorted(self.expr, key=repr)))) + + def __eq__(self, other: typing.Any) -> bool: + return super().__eq__(other) and self.expr == other.expr + + +class _Branch(FetchExpression): + """Branch along a predicate.""" + + # FIXME: Use a Predicate (like in ast.filter) so that we can also reverse them! + + # predicate to follow. + predicate: URI + + def __init__(self, predicate: URI): + if not isinstance(predicate, URI): + raise TypeError(predicate) + self.predicate = predicate + + def __repr__(self) -> str: + return f'{typename(self)}({self.predicate})' + + def __hash__(self) -> int: + return hash((super().__hash__(), self.predicate)) + + def __eq__(self, other: typing.Any) -> bool: + return super().__eq__(other) and self.predicate == other.predicate + + +class Fetch(_Branch): + """Follow a predicate before evaluating a child epxression.""" + + # child expression. + expr: FetchExpression + + def __init__(self, predicate: URI, expr: FetchExpression): + # check child expressions + if not isinstance(expr, FetchExpression): + raise TypeError(expr) + # initialize + super().__init__(predicate) + # assign members + self.expr = expr + + def __repr__(self) -> str: + return f'{typename(self)}({self.predicate}, {self.expr})' + + def __hash__(self) -> int: + return hash((super().__hash__(), self.expr)) + + def __eq__(self, other: typing.Any) -> bool: + return super().__eq__(other) and self.expr == other.expr + + +class _Named(_Branch): + """Fetch a (named) symbol at a predicate.""" + + # symbol name. + name: str + + def __init__(self, predicate: URI, name: str): + super().__init__(predicate) + self.name = str(name) + + def __repr__(self) -> str: + return f'{typename(self)}({self.predicate}, {self.name})' + + def __hash__(self) -> int: + return hash((super().__hash__(), self.name)) + + def __eq__(self, other: typing.Any) -> bool: + return super().__eq__(other) and self.name == other.name + + +class Node(_Named): # pylint: disable=too-few-public-methods + """Fetch a Node at a predicate.""" + # FIXME: Is this actually needed? + + +class Value(_Named): # pylint: disable=too-few-public-methods + """Fetch a Literal at a predicate.""" + + +class This(FetchExpression): + """Fetch the current Node.""" + + # symbol name. + name: str + + def __init__(self, name: str): + super().__init__() + self.name = str(name) + + def __repr__(self) -> str: + return f'{typename(self)}({self.name})' + + def __hash__(self) -> int: + return hash((super().__hash__(), self.name)) + + def __eq__(self, other: typing.Any) -> bool: + return super().__eq__(other) and self.name == other.name + +## EOF ## diff --git a/bsfs/query/ast/filter_.py b/bsfs/query/ast/filter_.py new file mode 100644 index 0000000..610fdb4 --- /dev/null +++ b/bsfs/query/ast/filter_.py @@ -0,0 +1,516 @@ +"""Filter AST. + +Note that it is easily possible to construct an AST that is inconsistent with +a given schema. Furthermore, it is possible to construct a semantically invalid +AST which that cannot be parsed correctly or includes contradicting statements. +The AST nodes do not (and cannot) check such issues. + +For example, consider the following AST: + +>>> Any(ns.bse.collection, +... And( +... Equals('hello'), +... Is('hello world'), +... Any(ns.bse.tag, Equals('world')), +... Any(ns.bst.label, Equals('world')), +... All(ns.bst.label, Not(Equals('world'))), +... ) +... ) + +This AST has multiple issues that are not verified upon its creation: +* A condition on a non-literal. +* A Filter on a literal. +* Conditions exclude each other +* The predicate along the branch have incompatible domains and ranges. + +""" +# imports +from collections import abc +import typing + +# bsfs imports +from bsfs.utils import URI, typename, normalize_args + +# exports +__all__ : typing.Sequence[str] = ( + # base classes + 'FilterExpression', + 'PredicateExpression', + # predicate expressions + 'OneOf', + 'Predicate', + # branching + 'All', + 'Any', + # aggregators + 'And', + 'Or', + # value matchers + 'Equals', + 'Substring', + 'EndsWith', + 'StartsWith', + # range matchers + 'GreaterThan', + 'LessThan', + # misc + 'Has', + 'Is', + 'Not', + ) + + +## code ## + +# pylint: disable=too-few-public-methods # Many expressions use mostly magic methods + +class _Expression(abc.Hashable): + def __repr__(self) -> str: + """Return the expressions's string representation.""" + return f'{typename(self)}()' + + def __hash__(self) -> int: + """Return the expression's integer representation.""" + return hash(type(self)) + + def __eq__(self, other: typing.Any) -> bool: + """Return True if *self* and *other* are equivalent.""" + return isinstance(other, type(self)) + + +class FilterExpression(_Expression): + """Generic Filter expression.""" + + +class PredicateExpression(_Expression): + """Generic Predicate expression.""" + + +class _Branch(FilterExpression): + """Branch the filter along a predicate.""" + + # predicate to follow. + predicate: PredicateExpression + + # child expression to evaluate. + expr: FilterExpression + + def __init__( + self, + predicate: typing.Union[PredicateExpression, URI], + expr: FilterExpression, + ): + # process predicate argument + if isinstance(predicate, URI): + predicate = Predicate(predicate) + elif not isinstance(predicate, PredicateExpression): + raise TypeError(predicate) + # process expression argument + if not isinstance(expr, FilterExpression): + raise TypeError(expr) + # assign members + self.predicate = predicate + self.expr = expr + + def __repr__(self) -> str: + return f'{typename(self)}({self.predicate}, {self.expr})' + + def __hash__(self) -> int: + return hash((super().__hash__(), self.predicate, self.expr)) + + def __eq__(self, other) -> bool: + return super().__eq__(other) \ + and self.predicate == other.predicate \ + and self.expr == other.expr + +class Any(_Branch): + """Any (and at least one) triple matches.""" + + +class All(_Branch): + """All (and at least one) triples match.""" + + +class _Agg(FilterExpression, abc.Collection): + """Combine multiple expressions.""" + + # child expressions + expr: typing.Set[FilterExpression] + + def __init__( + self, + *expr: typing.Union[FilterExpression, + typing.Iterable[FilterExpression], + typing.Iterator[FilterExpression]] + ): + # unfold arguments + unfolded = set(normalize_args(*expr)) + # check type + if not all(isinstance(e, FilterExpression) for e in unfolded): + raise TypeError(expr) + # FIXME: Require at least one child expression? + # assign member + self.expr = unfolded + + def __contains__(self, expr: typing.Any) -> bool: + """Return True if *expr* is among the child expressions.""" + return expr in self.expr + + def __iter__(self) -> typing.Iterator[FilterExpression]: + """Iterator over child expressions.""" + return iter(self.expr) + + def __len__(self) -> int: + """Number of child expressions.""" + return len(self.expr) + + def __repr__(self) -> str: + return f'{typename(self)}({self.expr})' + + def __hash__(self) -> int: + return hash((super().__hash__(), tuple(sorted(self.expr, key=repr)))) + + def __eq__(self, other) -> bool: + return super().__eq__(other) and self.expr == other.expr + + +class And(_Agg): + """All conditions match.""" + + +class Or(_Agg): + """At least one condition matches.""" + + +class Not(FilterExpression): + """Invert a statement.""" + + # child expression + expr: FilterExpression + + def __init__(self, expr: FilterExpression): + # check argument + if not isinstance(expr, FilterExpression): + raise TypeError(expr) + # assign member + self.expr = expr + + def __repr__(self) -> str: + return f'{typename(self)}({self.expr})' + + def __hash__(self) -> int: + return hash((super().__hash__(), self.expr)) + + def __eq__(self, other: typing.Any) -> bool: + return super().__eq__(other) and self.expr == other.expr + + +class Has(FilterExpression): + """Has predicate N times""" + + # predicate to follow. + predicate: PredicateExpression + + # target count + count: FilterExpression + + def __init__( + self, + predicate: typing.Union[PredicateExpression, URI], + count: typing.Optional[typing.Union[FilterExpression, int]] = None, + ): + # check predicate + if isinstance(predicate, URI): + predicate = Predicate(predicate) + elif not isinstance(predicate, PredicateExpression): + raise TypeError(predicate) + # check count + if count is None: + count = GreaterThan(1, strict=False) + elif isinstance(count, int): + count = Equals(count) + elif not isinstance(count, FilterExpression): + raise TypeError(count) + # assign members + self.predicate = predicate + self.count = count + + def __repr__(self) -> str: + return f'{typename(self)}({self.predicate}, {self.count})' + + def __hash__(self) -> int: + return hash((super().__hash__(), self.predicate, self.count)) + + def __eq__(self, other) -> bool: + return super().__eq__(other) \ + and self.predicate == other.predicate \ + and self.count == other.count + + +class _Value(FilterExpression): + """Matches some value.""" + + # target value. + value: typing.Any + + def __init__(self, value: typing.Any): + self.value = value + + def __repr__(self) -> str: + return f'{typename(self)}({self.value})' + + def __hash__(self) -> int: + return hash((super().__hash__(), self.value)) + + def __eq__(self, other) -> bool: + return super().__eq__(other) and self.value == other.value + + +class Is(_Value): + """Match the URI of a node.""" + + +class Equals(_Value): + """Value matches exactly. + NOTE: Value must correspond to literal type. + """ + + +class Substring(_Value): + """Value matches a substring + NOTE: value must be a string. + """ + + +class StartsWith(_Value): + """Value begins with a given string.""" + + +class EndsWith(_Value): + """Value ends with a given string.""" + + +class Distance(FilterExpression): + """Distance to a reference is (strictly) below a threshold. Assumes a Feature literal.""" + + # FIXME: + # (a) pass a node/predicate as anchor instead of a value. + # Then we don't need to materialize the reference. + # (b) pass a FilterExpression (_Bounded) instead of a threshold. + # Then, we could also query values greater than a threshold. + + # reference value. + reference: typing.Any + + # distance threshold. + threshold: float + + # closed (True) or open (False) bound. + strict: bool + + def __init__( + self, + reference: typing.Any, + threshold: float, + strict: bool = False, + ): + self.reference = reference + self.threshold = float(threshold) + self.strict = bool(strict) + + def __repr__(self) -> str: + return f'{typename(self)}({self.reference}, {self.threshold}, {self.strict})' + + def __hash__(self) -> int: + return hash((super().__hash__(), tuple(self.reference), self.threshold, self.strict)) + + def __eq__(self, other) -> bool: + return super().__eq__(other) \ + and self.reference == other.reference \ + and self.threshold == other.threshold \ + and self.strict == other.strict + + +class _Bounded(FilterExpression): + """Value is bounded by a threshold. Assumes a Number literal.""" + + # bound. + threshold: float + + # closed (True) or open (False) bound. + strict: bool + + def __init__( + self, + threshold: float, + strict: bool = True, + ): + self.threshold = float(threshold) + self.strict = bool(strict) + + def __repr__(self) -> str: + return f'{typename(self)}({self.threshold}, {self.strict})' + + def __hash__(self) -> int: + return hash((super().__hash__(), self.threshold, self.strict)) + + def __eq__(self, other) -> bool: + return super().__eq__(other) \ + and self.threshold == other.threshold \ + and self.strict == other.strict + + + +class LessThan(_Bounded): + """Value is (strictly) smaller than threshold. Assumes a Number literal.""" + + +class GreaterThan(_Bounded): + """Value is (strictly) larger than threshold. Assumes a Number literal.""" + + +class Predicate(PredicateExpression): + """A single predicate.""" + + # predicate URI + predicate: URI + + # reverse the predicate's direction + reverse: bool + + def __init__( + self, + predicate: URI, + reverse: typing.Optional[bool] = False, + ): + # check arguments + if not isinstance(predicate, URI): + raise TypeError(predicate) + # assign members + self.predicate = predicate + self.reverse = bool(reverse) + + def __repr__(self) -> str: + return f'{typename(self)}({self.predicate}, {self.reverse})' + + def __hash__(self) -> int: + return hash((super().__hash__(), self.predicate, self.reverse)) + + def __eq__(self, other) -> bool: + return super().__eq__(other) \ + and self.predicate == other.predicate \ + and self.reverse == other.reverse + + +class OneOf(PredicateExpression, abc.Collection): + """A set of predicate alternatives. + + The predicates' domains must be ascendants or descendants of each other. + The overall domain is the most specific one. + + The predicate's domains must be ascendants or descendants of each other. + The overall range is the most generic one. + """ + + # predicate alternatives + expr: typing.Set[PredicateExpression] + + def __init__(self, *expr: typing.Union[PredicateExpression, URI]): + # unfold arguments + unfolded = set(normalize_args(*expr)) # type: ignore [arg-type] # this is getting too complex... + # check arguments + if len(unfolded) == 0: + raise AttributeError('expected at least one expression, found none') + # ensure PredicateExpression + unfolded = {Predicate(e) if isinstance(e, URI) else e for e in unfolded} + # check type + if not all(isinstance(e, PredicateExpression) for e in unfolded): + raise TypeError(expr) + # assign member + self.expr = unfolded + + def __contains__(self, expr: typing.Any) -> bool: + """Return True if *expr* is among the child expressions.""" + return expr in self.expr + + def __iter__(self) -> typing.Iterator[PredicateExpression]: + """Iterator over child expressions.""" + return iter(self.expr) + + def __len__(self) -> int: + """Number of child expressions.""" + return len(self.expr) + + def __repr__(self) -> str: + return f'{typename(self)}({self.expr})' + + def __hash__(self) -> int: + return hash((super().__hash__(), tuple(sorted(self.expr, key=repr)))) + + def __eq__(self, other) -> bool: + return super().__eq__(other) and self.expr == other.expr + + +# Helpers +# invalid-name is disabled since they explicitly mimic an expression + +def IsIn(*values) -> FilterExpression: # pylint: disable=invalid-name + """Match any of the given URIs.""" + args = normalize_args(*values) + if len(args) == 0: + raise AttributeError('expected at least one value, found none') + if len(args) == 1: + return Is(args[0]) + return Or(Is(value) for value in args) + +def IsNotIn(*values) -> FilterExpression: # pylint: disable=invalid-name + """Match none of the given URIs.""" + return Not(IsIn(*values)) + + +def Between( # pylint: disable=invalid-name + lo: float = float('-inf'), + hi: float = float('inf'), + lo_strict: bool = True, + hi_strict: bool = True, + ) -> FilterExpression : + """Match numerical values between *lo* and *hi*. Include bounds if strict is False.""" + if abs(lo) == hi == float('inf'): + raise ValueError('range cannot be INF on both sides') + if lo > hi: + raise ValueError(f'lower bound ({lo}) cannot be less than upper bound ({hi})') + if lo == hi and not lo_strict and not hi_strict: + return Equals(lo) + if lo == hi: # either bound is strict + raise ValueError('bounds cannot be equal when either is strict') + if lo != float('-inf') and hi != float('inf'): + return And(GreaterThan(lo, lo_strict), LessThan(hi, hi_strict)) + if lo != float('-inf'): + return GreaterThan(lo, lo_strict) + # hi != float('inf'): + return LessThan(hi, hi_strict) + + +def Includes(*values, approx: bool = False) -> FilterExpression: # pylint: disable=invalid-name + """Match any of the given *values*. Uses `Substring` if *approx* is set.""" + args = normalize_args(*values) + cls = Substring if approx else Equals + if len(args) == 0: + raise AttributeError('expected at least one value, found none') + if len(args) == 1: + return cls(args[0]) + return Or(cls(v) for v in args) + + +def Excludes(*values, approx: bool = False) -> FilterExpression: # pylint: disable=invalid-name + """Match none of the given *values*. Uses `Substring` if *approx* is set.""" + args = normalize_args(*values) + cls = Substring if approx else Equals + if len(args) == 0: + raise AttributeError('expected at least one value, found none') + if len(args) == 1: + return Not(cls(args[0])) + return Not(Or(cls(v) for v in args)) + + +## EOF ## diff --git a/bsfs/query/matcher.py b/bsfs/query/matcher.py new file mode 100644 index 0000000..17c9c8e --- /dev/null +++ b/bsfs/query/matcher.py @@ -0,0 +1,361 @@ + +# imports +from collections import defaultdict +from itertools import product +from time import time +import random +import threading +import typing + +# external imports +from hopcroftkarp import HopcroftKarp + +# bsfs imports +from bsfs.utils import errors, typename + +# inner-module imports +from . import ast + +# exports +__all__ : typing.Sequence[str] = ( + 'Filter', + ) + + +## code ## + +class Any(ast.filter.FilterExpression, ast.filter.PredicateExpression): + """Match any ast class. + + Note that Any instances are unique, i.e. they do not compare, and + can hence be repeated in a set: + >>> Any() == Any() + False + >>> len({Any(), Any(), Any(), Any()}) + 4 + + """ + + # unique instance id + _uid: typing.Tuple[int, int, float, float] + + def __init__(self): + self._uid = ( + id(self), + id(threading.current_thread()), + time(), + random.random(), + ) + + def __eq__(self, other: typing.Any): + return super().__eq__(other) and self._uid == other._uid + + def __hash__(self): + return hash((super().__hash__(), self._uid)) + + +class Rest(ast.filter.FilterExpression, ast.filter.PredicateExpression): + """Match the leftovers in a set of items to be compared. + + Rest can be used in junction with aggregating expressions such as ast.filter.And, + ast.filter.Or, ast.filter.OneOf. It controls childs expressions that were not yet + consumed by other matching rules. Rest may match to only a specific expression. + The expresssion defaults to Any(). + + For example, the following to ast structures would match since Rest + allows an arbitrary repetition of ast.filter.Equals statements. + + >>> And(Equals('hello'), Equals('world'), Equals('foobar')) + >>> And(Equals('world'), Rest(Partial(Equals))) + + """ + + # child expression for the Rest. + expr: typing.Union[ast.filter.FilterExpression, ast.filter.PredicateExpression] + + def __init__( + self, + expr: typing.Optional[typing.Union[ast.filter.FilterExpression, ast.filter.PredicateExpression]] = None, + ): + if expr is None: + expr = Any() + self.expr = expr + + def __repr__(self) -> str: + return f'{typename(self)}({self.expr})' + + def __hash__(self) -> int: + return hash((super().__hash__(), self.expr)) + + def __eq__(self, other: typing.Any) -> bool: + return super().__eq__(other) and self.expr == other.expr + + +class Partial(ast.filter.FilterExpression, ast.filter.PredicateExpression): + """Match a partially defined ast expression. + + Literal values might be irrelevant or unknown when comparing two ast + structures. Partial allows to constrain the matcher to a certain + ast class, while leaving some of its members unspecified. + + Pass the class (not instance) and its members as keyword arguments + to Partial. Note that the arguments are not validated. + + For example, the following instance matches any ast.filter.Equals, + irrespective of its value: + + >>> Partial(ast.filter.Equals) + + Likewise, the following instance matches any ast.filter.LessThan + that has a strict bounds, but makes no claim about the threshold: + + >>> Partial(ast.filter.LessThan, strict=False) + + """ + + # target node type. + node: typing.Type + + # node construction args. + kwargs: typing.Dict[str, typing.Any] + + def __init__( + self, + node: typing.Type, + **kwargs, + ): + self.node = node + self.kwargs = kwargs + + def __repr__(self) -> str: + return f'{typename(self)}({self.node.__name__}, {self.kwargs})' + + def __hash__(self) -> int: + kwargs = tuple((key, self.kwargs[key]) for key in sorted(self.kwargs)) + return hash((super().__hash__(), self.node, kwargs)) + + def __eq__(self, other: typing.Any) -> bool: + return super().__eq__(other) \ + and self.node == other.node \ + and self.kwargs == other.kwargs + + def match( + self, + name: str, + value: typing.Any, + ) -> bool: + """Return True if *name* is unspecified or matches *value*.""" + return name not in self.kwargs or self.kwargs[name] == value + + +T_ITEM_TYPE = typing.TypeVar('T_ITEM_TYPE') # pylint: disable=invalid-name + +def _set_matcher( + query: typing.Collection[T_ITEM_TYPE], + reference: typing.Collection[T_ITEM_TYPE], + cmp: typing.Callable[[T_ITEM_TYPE, T_ITEM_TYPE], bool], + ) -> bool: + """Compare two sets of child expressions. + + This check has a best-case complexity of O(|N|**2) and worst-case + complexity of O(|N|**3), with N the number of child expressions. + """ + # get reference items + r_items = list(reference) + # deal with Rest + r_rest = {itm for itm in r_items if isinstance(itm, Rest)} + if len(r_rest) > 1: + raise errors.BackendError(f'there must be at most one Rest instance per set, found {len(r_rest)}') + if len(r_rest) == 1: + # replace Rest by filling the reference up with rest's expression + # NOTE: convert r_items to list so that items can be repeated + expr = next(iter(r_rest)).expr # type: ignore [attr-defined] + r_items = [itm for itm in r_items if not isinstance(itm, Rest)] + r_items += [expr for _ in range(len(query) - len(r_items))] # type: ignore [misc] + # sanity check: cannot match if the item sizes differ: + # either a reference item is unmatched (len(r_items) > len(query)) + # or a query item is unmatched (len(r_items) < len(query)) + if len(query) != len(r_items): + return False + + # To have a positive match between the query and the reference, + # each query expr has to match any reference expr. + # However, each reference expr can only be "consumed" once even + # if it matches multiple query exprs (e.g., the Any expression matches + # every query expr). + # This is a bipartide matching problem (Hall's marriage problem) + # and the Hopcroft-Karp-Karzanov algorithm finds a maximum + # matching. While there might be multiple maximum matchings, + # we only need to know whether (at least) one complete matching + # exists. The hopcroftkarp module provides this functionality. + # The HKK algorithm has worst-case complexity of O(|N|**2 * sqrt(|N|)) + # and we also need to compare expressions pairwise, hence O(|N|**2). + num_items = len(r_items) + graph = defaultdict(set) + # build the bipartide graph as {lhs: {rhs}, ...} + # lhs and rhs must be disjoint identifiers. + for (ridx, ref), (nidx, node) in product(enumerate(r_items), enumerate(query)): + # add edges for equal expressions + if cmp(node, ref): + graph[ridx].add(num_items + nidx) + + # maximum_matching returns the matches for all nodes in the graph + # ({ref_itm: node_itm}), hence a complete matching's size is + # the number of reference's child expressions. + return len(HopcroftKarp(graph).maximum_matching(keys_only=True)) == num_items + + +class Filter(): + """Compare a bsfs.query.ast.filter` query's structure to a reference ast. + + The reference ast may include `Rest`, `Partial`, or `Any` to account for irrelevant + or unknown ast pieces. + + This is only a structural comparison, not a semantic one. For example, the + two following queries are semantically identical, but structurally different, + and would therefore not match: + + >>> ast.filter.OneOf(ast.filter.Predicate(ns.bse.name)) + >>> ast.filter.Predicate(ns.bse.name) + + """ + + def __call__(self, query: ast.filter.FilterExpression, reference: ast.filter.FilterExpression) -> bool: + """Compare a *query* to a *reference* ast structure. + Return True if both are structurally equivalent. + """ + if not isinstance(query, ast.filter.FilterExpression): + raise errors.BackendError(f'expected filter expression, found {query}') + if not isinstance(reference, ast.filter.FilterExpression): + raise errors.BackendError(f'expected filter expression, found {reference}') + return self._parse_filter_expression(query, reference) + + def _parse_filter_expression( + self, + node: ast.filter.FilterExpression, + reference: ast.filter.FilterExpression, + ) -> bool: + """Route *node* to the handler of the respective FilterExpression subclass.""" + # generic checks: reference type must be Any or match node type + if isinstance(reference, Any): + return True + # node-specific checks + if isinstance(node, ast.filter.Not): + return self._not(node, reference) + if isinstance(node, ast.filter.Has): + return self._has(node, reference) + if isinstance(node, ast.filter.Distance): + return self._distance(node, reference) + if isinstance(node, (ast.filter.Any, ast.filter.All)): + return self._branch(node, reference) + if isinstance(node, (ast.filter.And, ast.filter.Or)): + return self._agg(node, reference) + if isinstance(node, (ast.filter.Is, ast.filter.Equals, ast.filter.Substring, + ast.filter.StartsWith, ast.filter.EndsWith)): + return self._value(node, reference) + if isinstance(node, (ast.filter.LessThan, ast.filter.GreaterThan)): + return self._bounded(node, reference) + # invalid node + raise errors.BackendError(f'expected filter expression, found {node}') + + def _parse_predicate_expression( + self, + node: ast.filter.PredicateExpression, + reference: ast.filter.PredicateExpression, + ) -> bool: + """Route *node* to the handler of the respective PredicateExpression subclass.""" + if isinstance(reference, Any): + return True + if isinstance(node, ast.filter.Predicate): + return self._predicate(node, reference) + if isinstance(node, ast.filter.OneOf): + return self._one_of(node, reference) + # invalid node + raise errors.BackendError(f'expected predicate expression, found {node}') + + def _one_of(self, node: ast.filter.OneOf, reference: ast.filter.PredicateExpression) -> bool: + if not isinstance(reference, type(node)): + return False + return _set_matcher(node, reference, self._parse_predicate_expression) + + def _predicate(self, node: ast.filter.Predicate, reference: ast.filter.PredicateExpression) -> bool: + if not isinstance(reference, (Partial, type(node))): + return False + # partial check + if isinstance(reference, Partial): + if not isinstance(node, reference.node): + return False + return reference.match('predicate', node.predicate) \ + and reference.match('reverse', node.reverse) + # full check + return node.predicate == reference.predicate \ + and node.reverse == reference.reverse + + def _branch(self, + node: typing.Union[ast.filter.Any, ast.filter.All], + reference: ast.filter.FilterExpression, + ) -> bool: + if not isinstance(reference, type(node)): + return False + if not self._parse_predicate_expression(node.predicate, reference.predicate): # type: ignore [attr-defined] + return False + if not self._parse_filter_expression(node.expr, reference.expr): # type: ignore [attr-defined] + return False + return True + + def _agg(self, node: typing.Union[ast.filter.And, ast.filter.Or], reference: ast.filter.FilterExpression) -> bool: + if not isinstance(reference, type(node)): + return False + return _set_matcher(node, reference, self._parse_filter_expression) # type: ignore [arg-type] + + def _not(self, node: ast.filter.Not, reference: ast.filter.FilterExpression) -> bool: + if not isinstance(reference, type(node)): + return False + return self._parse_filter_expression(node.expr, reference.expr) + + def _has(self, node: ast.filter.Has, reference: ast.filter.FilterExpression) -> bool: + if not isinstance(reference, type(node)): + return False + return self._parse_predicate_expression(node.predicate, reference.predicate) \ + and self._parse_filter_expression(node.count, reference.count) + + def _distance(self, node: ast.filter.Distance, reference: ast.filter.FilterExpression) -> bool: + if not isinstance(reference, (Partial, type(node))): + return False + # partial check + if isinstance(reference, Partial): + if not isinstance(node, reference.node): + return False + return reference.match('reference', node.reference) \ + and reference.match('threshold', node.threshold) \ + and reference.match('strict', node.strict) + # full check + return node.reference == reference.reference \ + and node.threshold == reference.threshold \ + and node.strict == reference.strict + + def _value(self, node: ast.filter._Value, reference: ast.filter.FilterExpression) -> bool: + if not isinstance(reference, (Partial, type(node))): + return False + # partial check + if isinstance(reference, Partial): + if not isinstance(node, reference.node): + return False + return reference.match('value', node.value) + # full ckeck + return node.value == reference.value + + def _bounded(self, node: ast.filter._Bounded, reference: ast.filter.FilterExpression) -> bool: + if not isinstance(reference, (Partial, type(node))): + return False + # partial check + if isinstance(reference, Partial): + if not isinstance(node, reference.node): + return False + return reference.match('threshold', node.threshold) \ + and reference.match('strict', node.strict) + # full check + return node.threshold == reference.threshold \ + and node.strict == reference.strict + +## EOF ## diff --git a/bsfs/query/validator.py b/bsfs/query/validator.py new file mode 100644 index 0000000..10ca492 --- /dev/null +++ b/bsfs/query/validator.py @@ -0,0 +1,351 @@ + +# imports +import typing + +# bsfs imports +from bsfs import schema as bsc +from bsfs.namespace import ns +from bsfs.utils import errors, typename + +# inner-module imports +from . import ast + +# exports +__all__ : typing.Sequence[str] = ( + 'Filter', + ) + +# FIXME: Split into a submodule and the two classes into their own respective files. + +## code ## + +class Filter(): + """Validate a `bsfs.query.ast.filter` query's structure and schema compliance. + + * Conditions (Bounded, Value) can only be applied on literals + * Branches, Id, and Has can only be applied on nodes + * Predicates' domain and range must match + * Predicate paths must follow the schema + * Referenced types are present in the schema + + """ + + # schema to validate against. + schema: bsc.Schema + + def __init__(self, schema: bsc.Schema): + self.schema = schema + + def __call__(self, root_type: bsc.Node, query: ast.filter.FilterExpression) -> bool: + """Alias for `Filter.validate`.""" + return self.validate(root_type, query) + + def validate(self, root_type: bsc.Node, query: ast.filter.FilterExpression) -> bool: + """Validate a filter *query*, assuming the subject having *root_type*. + + Raises a `bsfs.utils.errors.ConsistencyError` if the query violates the schema. + Raises a `bsfs.utils.errors.BackendError` if the query structure is invalid. + + """ + # root_type must be a schema.Node + if not isinstance(root_type, bsc.Node): + raise TypeError(f'expected a node, found {typename(root_type)}') + # root_type must exist in the schema + if root_type not in self.schema.nodes(): + raise errors.ConsistencyError(f'{root_type} is not defined in the schema') + # check root expression + self._parse_filter_expression(root_type, query) + # all tests passed + return True + + + ## routing methods + + def _parse_filter_expression(self, type_: bsc.Vertex, node: ast.filter.FilterExpression): + """Route *node* to the handler of the respective FilterExpression subclass.""" + if isinstance(node, ast.filter.Is): + return self._is(type_, node) + if isinstance(node, ast.filter.Not): + return self._not(type_, node) + if isinstance(node, ast.filter.Has): + return self._has(type_, node) + if isinstance(node, ast.filter.Distance): + return self._distance(type_, node) + if isinstance(node, (ast.filter.Any, ast.filter.All)): + return self._branch(type_, node) + if isinstance(node, (ast.filter.And, ast.filter.Or)): + return self._agg(type_, node) + if isinstance(node, (ast.filter.Equals, ast.filter.Substring, ast.filter.StartsWith, ast.filter.EndsWith)): + return self._value(type_, node) + if isinstance(node, (ast.filter.LessThan, ast.filter.GreaterThan)): + return self._bounded(type_, node) + # invalid node + raise errors.BackendError(f'expected filter expression, found {node}') + + def _parse_predicate_expression(self, node: ast.filter.PredicateExpression) -> typing.Tuple[bsc.Vertex, bsc.Vertex]: + """Route *node* to the handler of the respective PredicateExpression subclass.""" + if isinstance(node, ast.filter.Predicate): + return self._predicate(node) + if isinstance(node, ast.filter.OneOf): + return self._one_of(node) + # invalid node + raise errors.BackendError(f'expected predicate expression, found {node}') + + + ## predicate expressions + + def _predicate(self, node: ast.filter.Predicate) -> typing.Tuple[bsc.Vertex, bsc.Vertex]: + # predicate exists in the schema + if not self.schema.has_predicate(node.predicate): + raise errors.ConsistencyError(f'predicate {node.predicate} is not in the schema') + # determine domain and range + pred = self.schema.predicate(node.predicate) + if not isinstance(pred.range, (bsc.Node, bsc.Literal)): + raise errors.BackendError(f'the range of predicate {pred} is undefined') + dom, rng = pred.domain, pred.range + if node.reverse: + dom, rng = rng, dom # type: ignore [assignment] # variable re-use confuses mypy + # return domain and range + return dom, rng + + def _one_of(self, node: ast.filter.OneOf) -> typing.Tuple[bsc.Vertex, bsc.Vertex]: + # determine domain and range types + # NOTE: select the most specific domain and the most generic range + dom, rng = None, None + for pred in node: + # parse child expression + subdom, subrng = self._parse_predicate_expression(pred) + # determine overall domain + if dom is None or subdom < dom: # pick most specific domain + dom = subdom + # domains must be related across all child expressions + if not subdom <= dom and not subdom >= dom: + raise errors.ConsistencyError(f'domains {subdom} and {dom} are not related') + # determine overall range + if rng is None or subrng > rng: # pick most generic range + rng = subrng + # ranges must be related across all child expressions + if not subrng <= rng and not subrng >= rng: + raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related') + # OneOf guarantees at least one expression, dom and rng are always bsc.Vertex. + # mypy does not realize this, hence we ignore the warning. + return dom, rng # type: ignore [return-value] + + + ## intermediates + + def _branch(self, type_: bsc.Vertex, node: ast.filter._Branch): + # type is a Node + if not isinstance(type_, bsc.Node): + raise errors.ConsistencyError(f'expected a Node, found {type_}') + # type exists in the schema + # FIXME: Isn't it actually guaranteed that the type (except the root type) is part of the schema? + # all types can be traced back to (a) root_type, (b) predicate, or (c) manually set (e.g. in _is). + # For (a), we do (and have to) perform a check. For (c), the code base should be consistent throughout + # the module, so this is an assumption that has to be ensured in schema.Schema. For (b), we know (and + # check) that the predicate is in the schema, hence all node/literals derived from it are also in the + # schema by construction of the schema.Schema class. So, why do we check this every time? + if type_ not in self.schema.nodes(): + raise errors.ConsistencyError(f'node {type_} is not in the schema') + # predicate is valid + dom, rng = self._parse_predicate_expression(node.predicate) + # type_ is a subtype of the predicate's domain + if not type_ <= dom: + raise errors.ConsistencyError(f'expected type {dom} or subtype thereof, found {type_}') + # child expression is valid + self._parse_filter_expression(rng, node.expr) + + def _agg(self, type_: bsc.Vertex, node: ast.filter._Agg): + for expr in node: + # child expression is valid + self._parse_filter_expression(type_, expr) + + def _not(self, type_: bsc.Vertex, node: ast.filter.Not): + # child expression is valid + self._parse_filter_expression(type_, node.expr) + + def _has(self, type_: bsc.Vertex, node: ast.filter.Has): + # type is a Node + if not isinstance(type_, bsc.Node): + raise errors.ConsistencyError(f'expected a Node, found {type_}') + # type exists in the schema + if type_ not in self.schema.nodes(): + raise errors.ConsistencyError(f'node {type_} is not in the schema') + # predicate is valid + dom, _= self._parse_predicate_expression(node.predicate) + # type_ is a subtype of the predicate's domain + if not type_ <= dom: + raise errors.ConsistencyError(f'expected type {dom}, found {type_}') + # node.count is a numerical expression + self._parse_filter_expression(self.schema.literal(ns.bsl.Number), node.count) + + def _distance(self, type_: bsc.Vertex, node: ast.filter.Distance): + # type is a Literal + if not isinstance(type_, bsc.Feature): + raise errors.ConsistencyError(f'expected a Feature, found {type_}') + # type exists in the schema + if type_ not in self.schema.literals(): + raise errors.ConsistencyError(f'literal {type_} is not in the schema') + # reference matches type_ + if len(node.reference) != type_.dimension: + raise errors.ConsistencyError(f'reference has dimension {len(node.reference)}, expected {type_.dimension}') + # FIXME: test dtype + + + ## conditions + + def _is(self, type_: bsc.Vertex, node: ast.filter.Is): # pylint: disable=unused-argument # (node) + if not isinstance(type_, bsc.Node): + raise errors.ConsistencyError(f'expected a Node, found {type_}') + if type_ not in self.schema.nodes(): + raise errors.ConsistencyError(f'node {type_} is not in the schema') + + def _value(self, type_: bsc.Vertex, node: ast.filter._Value): # pylint: disable=unused-argument # (node) + # type is a literal + if not isinstance(type_, bsc.Literal): + raise errors.ConsistencyError(f'expected a Literal, found {type_}') + # type exists in the schema + if type_ not in self.schema.literals(): + raise errors.ConsistencyError(f'literal {type_} is not in the schema') + # FIXME: Check if node.value corresponds to type_ + # FIXME: A specific literal might be requested (i.e., a numeric type when used in Has) + + def _bounded(self, type_: bsc.Vertex, node: ast.filter._Bounded): # pylint: disable=unused-argument # (node) + # type is a literal + if not isinstance(type_, bsc.Literal): + raise errors.ConsistencyError(f'expected a Literal, found {type_}') + # type exists in the schema + if type_ not in self.schema.literals(): + raise errors.ConsistencyError(f'literal {type_} is not in the schema') + # type must be a numerical + if not type_ <= self.schema.literal(ns.bsl.Number): + raise errors.ConsistencyError(f'expected a number type, found {type_}') + # FIXME: Check if node.value corresponds to type_ + + +class Fetch(): + """Validate a `bsfs.query.ast.fetch` query's structure and schema compliance. + + * Value can only be applied on literals + * Node can only be applied on nodes + * Names must be non-empty + * Branching nodes' predicates must match the type + * Symbols must be in the schema + * Predicates must follow the schema + + """ + + # schema to validate against. + schema: bsc.Schema + + def __init__(self, schema: bsc.Schema): + self.schema = schema + + def __call__(self, root_type: bsc.Node, query: ast.fetch.FetchExpression) -> bool: + """Alias for `Fetch.validate`.""" + return self.validate(root_type, query) + + def validate(self, root_type: bsc.Node, query: ast.fetch.FetchExpression) -> bool: + """Validate a fetch *query*, assuming the subject having *root_type*. + + Raises a `bsfs.utils.errors.ConsistencyError` if the query violates the schema. + Raises a `bsfs.utils.errors.BackendError` if the query structure is invalid. + + """ + # root_type must be a schema.Node + if not isinstance(root_type, bsc.Node): + raise TypeError(f'expected a node, found {typename(root_type)}') + # root_type must exist in the schema + if root_type not in self.schema.nodes(): + raise errors.ConsistencyError(f'{root_type} is not defined in the schema') + # query must be a FetchExpression + if not isinstance(query, ast.fetch.FetchExpression): + raise TypeError(f'expected a fetch expression, found {typename(query)}') + # check root expression + self._parse_fetch_expression(root_type, query) + # all tests passed + return True + + def _parse_fetch_expression(self, type_: bsc.Vertex, node: ast.fetch.FetchExpression): + """Route *node* to the handler of the respective FetchExpression subclass.""" + if isinstance(node, (ast.fetch.Fetch, ast.fetch.Value, ast.fetch.Node)): + # NOTE: don't return so that checks below are executed + self._branch(type_, node) + if isinstance(node, (ast.fetch.Value, ast.fetch.Node)): + # NOTE: don't return so that checks below are executed + self._named(type_, node) + if isinstance(node, ast.fetch.All): + return self._all(type_, node) + if isinstance(node, ast.fetch.Fetch): + return self._fetch(type_, node) + if isinstance(node, ast.fetch.Value): + return self._value(type_, node) + if isinstance(node, ast.fetch.Node): + return self._node(type_, node) + if isinstance(node, ast.fetch.This): + return self._this(type_, node) + # invalid node + raise errors.BackendError(f'expected fetch expression, found {node}') + + def _all(self, type_: bsc.Vertex, node: ast.fetch.All): + # check child expressions + for expr in node: + self._parse_fetch_expression(type_, expr) + + def _branch(self, type_: bsc.Vertex, node: ast.fetch._Branch): + # type is a node + if not isinstance(type_, bsc.Node): + raise errors.ConsistencyError(f'expected a Node, found {type_}') + # node exists in the schema + if type_ not in self.schema.nodes(): + raise errors.ConsistencyError(f'node {type_} is not in the schema') + # predicate exists in the schema + if not self.schema.has_predicate(node.predicate): + raise errors.ConsistencyError(f'predicate {node.predicate} is not in the schema') + pred = self.schema.predicate(node.predicate) + # type_ must be a subclass of domain + if not type_ <= pred.domain: + raise errors.ConsistencyError( + f'expected type {pred.domain} or subtype thereof, found {type_}') + + def _fetch(self, type_: bsc.Vertex, node: ast.fetch.Fetch): # pylint: disable=unused-argument # type_ was considered in _branch + # range must be a node + rng = self.schema.predicate(node.predicate).range + if not isinstance(rng, bsc.Node): + raise errors.ConsistencyError( + f'expected the predicate\'s range to be a Node, found {rng}') + # child expression must be valid + self._parse_fetch_expression(rng, node.expr) + + def _named(self, type_: bsc.Vertex, node: ast.fetch._Named): # pylint: disable=unused-argument # type_ was considered in _branch + # name must be set + if node.name.strip() == '': + raise errors.BackendError('node name cannot be empty') + # FIXME: check for double name use? + + def _node(self, type_: bsc.Vertex, node: ast.fetch.Node): # pylint: disable=unused-argument # type_ was considered in _branch + # range must be a node + rng = self.schema.predicate(node.predicate).range + if not isinstance(rng, bsc.Node): + raise errors.ConsistencyError( + f'expected the predicate\'s range to be a Node, found {rng}') + + def _value(self, type_: bsc.Vertex, node: ast.fetch.Value): # pylint: disable=unused-argument # type_ was considered in _branch + # range must be a literal + rng = self.schema.predicate(node.predicate).range + if not isinstance(rng, bsc.Literal): + raise errors.ConsistencyError( + f'expected the predicate\'s range to be a Literal, found {rng}') + + def _this(self, type_: bsc.Vertex, node: ast.fetch.This): + # type is a node + if not isinstance(type_, bsc.Node): + raise errors.ConsistencyError(f'expected a Node, found {type_}') + # node exists in the schema + if type_ not in self.schema.nodes(): + raise errors.ConsistencyError(f'node {type_} is not in the schema') + # name must be set + if node.name.strip() == '': + raise errors.BackendError('node name cannot be empty') + +## EOF ## diff --git a/bsfs/schema/__init__.py b/bsfs/schema/__init__.py index ad4d456..ca2e0cd 100644 --- a/bsfs/schema/__init__.py +++ b/bsfs/schema/__init__.py @@ -1,15 +1,15 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import typing # inner-module imports from .schema import Schema -from .types import Literal, Node, Predicate +from .serialize import from_string, to_string +from .types import Literal, Node, Predicate, Vertex, Feature, \ + ROOT_VERTEX, ROOT_NODE, ROOT_LITERAL, \ + ROOT_NUMBER, ROOT_TIME, \ + ROOT_ARRAY, ROOT_FEATURE, \ + ROOT_PREDICATE # exports __all__: typing.Sequence[str] = ( @@ -17,6 +17,8 @@ __all__: typing.Sequence[str] = ( 'Node', 'Predicate', 'Schema', + 'from_string', + 'to_string', ) ## EOF ## diff --git a/bsfs/schema/schema.py b/bsfs/schema/schema.py index c5d4571..c104436 100644 --- a/bsfs/schema/schema.py +++ b/bsfs/schema/schema.py @@ -1,16 +1,9 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports from collections import abc, namedtuple import typing -import rdflib # bsfs imports -from bsfs.namespace import ns from bsfs.utils import errors, URI, typename # inner-module imports @@ -51,11 +44,13 @@ class Schema(): def __init__( self, - predicates: typing.Iterable[types.Predicate], + predicates: typing.Optional[typing.Iterable[types.Predicate]] = None, nodes: typing.Optional[typing.Iterable[types.Node]] = None, literals: typing.Optional[typing.Iterable[types.Literal]] = None, ): # materialize arguments + if predicates is None: + predicates = set() if nodes is None: nodes = set() if literals is None: @@ -63,24 +58,41 @@ class Schema(): nodes = set(nodes) literals = set(literals) predicates = set(predicates) + + # add root types to the schema + nodes.add(types.ROOT_NODE) + literals.add(types.ROOT_LITERAL) + predicates.add(types.ROOT_PREDICATE) + # add minimally necessary types to the schema + literals.add(types.ROOT_BLOB) + literals.add(types.ROOT_NUMBER) + literals.add(types.ROOT_TIME) + literals.add(types.ROOT_ARRAY) + literals.add(types.ROOT_FEATURE) + + # FIXME: ensure that types derive from the right root? + # include parents in predicates set # TODO: review type annotations and ignores for python >= 3.11 (parents is _Type but should be typing.Self) predicates |= {par for pred in predicates for par in pred.parents()} # type: ignore [misc] # include predicate domain in nodes set nodes |= {pred.domain for pred in predicates} # include predicate range in nodes and literals sets - prange = {pred.range for pred in predicates if pred.range is not None} + prange = {pred.range for pred in predicates} nodes |= {vert for vert in prange if isinstance(vert, types.Node)} literals |= {vert for vert in prange if isinstance(vert, types.Literal)} + # NOTE: ROOT_PREDICATE has a Vertex as range which is neither in nodes nor literals + # FIXME: with the ROOT_VERTEX missing, the schema is not complete anymore! + # include parents in nodes and literals sets - # NOTE: Must be done after predicate domain/range was handled - # so that their parents are included as well. + # NOTE: Must come after predicate domain/range was handled to have their parents as well. nodes |= {par for node in nodes for par in node.parents()} # type: ignore [misc] literals |= {par for lit in literals for par in lit.parents()} # type: ignore [misc] # assign members self._nodes = {node.uri: node for node in nodes} self._literals = {lit.uri: lit for lit in literals} self._predicates = {pred.uri: pred for pred in predicates} + # verify unique uris if len(nodes) != len(self._nodes): raise errors.ConsistencyError('inconsistent nodes') @@ -214,6 +226,7 @@ class Schema(): >>> Schema.Union([a, b, c]) """ + # FIXME: copy type annotations? if len(args) == 0: raise TypeError('Schema.Union requires at least one argument (Schema or Iterable)') if isinstance(args[0], cls): # args is sequence of Schema instances @@ -295,92 +308,8 @@ class Schema(): """Return the Literal matching the *uri*.""" return self._literals[uri] - - ## constructors ## - - - @classmethod - def Empty(cls) -> 'Schema': # pylint: disable=invalid-name # capitalized classmethod - """Return a minimal Schema.""" - node = types.Node(ns.bsfs.Node, None) - literal = types.Literal(ns.bsfs.Literal, None) - predicate = types.Predicate( - uri=ns.bsfs.Predicate, - parent=None, - domain=node, - range=None, - unique=False, - ) - return cls((predicate, ), (node, ), (literal, )) - - - @classmethod - def from_string(cls, schema: str) -> 'Schema': # pylint: disable=invalid-name # capitalized classmethod - """Load and return a Schema from a string.""" - # parse string into rdf graph - graph = rdflib.Graph() - graph.parse(data=schema, format='turtle') - - def _fetch_hierarchically(factory, curr): - # emit current node - yield curr - # walk through childs - for child in graph.subjects(rdflib.URIRef(ns.rdfs.subClassOf), rdflib.URIRef(curr.uri)): - # convert to URI - child = URI(child) - # check circular dependency - if child == curr.uri or child in {node.uri for node in curr.parents()}: - raise errors.ConsistencyError('circular dependency') - # recurse and emit (sub*)childs - yield from _fetch_hierarchically(factory, factory(child, curr)) - - # fetch nodes - nodes = set(_fetch_hierarchically(types.Node, types.Node(ns.bsfs.Node, None))) - nodes_lut = {node.uri: node for node in nodes} - if len(nodes_lut) != len(nodes): - raise errors.ConsistencyError('inconsistent nodes') - - # fetch literals - literals = set(_fetch_hierarchically(types.Literal, types.Literal(ns.bsfs.Literal, None))) - literals_lut = {lit.uri: lit for lit in literals} - if len(literals_lut) != len(literals): - raise errors.ConsistencyError('inconsistent literals') - - # fetch predicates - def build_predicate(uri, parent): - uri = rdflib.URIRef(uri) - # get domain - domains = set(graph.objects(uri, rdflib.RDFS.domain)) - if len(domains) != 1: - raise errors.ConsistencyError(f'inconsistent domain: {domains}') - dom = nodes_lut.get(next(iter(domains))) - if dom is None: - raise errors.ConsistencyError('missing domain') - # get range - ranges = set(graph.objects(uri, rdflib.RDFS.range)) - if len(ranges) != 1: - raise errors.ConsistencyError(f'inconsistent range: {ranges}') - rng = next(iter(ranges)) - rng = nodes_lut.get(rng, literals_lut.get(rng)) - if rng is None: - raise errors.ConsistencyError('missing range') - # get unique flag - uniques = set(graph.objects(uri, rdflib.URIRef(ns.bsfs.unique))) - if len(uniques) != 1: - raise errors.ConsistencyError(f'inconsistent unique flags: {uniques}') - unique = bool(next(iter(uniques))) - # build Predicate - return types.Predicate(URI(uri), parent, dom, rng, unique) - - root_predicate = types.Predicate( - uri=ns.bsfs.Predicate, - parent=None, - domain=nodes_lut[ns.bsfs.Node], - range=None, # FIXME: Unclear how to handle this! Can be either a Literal or a Node - unique=False, - ) - predicates = _fetch_hierarchically(build_predicate, root_predicate) - # return Schema - return cls(predicates, nodes, literals) + def predicates_at(self, node: types.Node) -> typing.Iterator[types.Predicate]: + """Return predicates that have domain *node* (or superclass thereof).""" + return iter(pred for pred in self._predicates.values() if node <= pred.domain) ## EOF ## diff --git a/bsfs/schema/serialize.py b/bsfs/schema/serialize.py new file mode 100644 index 0000000..ea8b2f4 --- /dev/null +++ b/bsfs/schema/serialize.py @@ -0,0 +1,255 @@ + +# standard imports +import itertools +import typing + +# external imports +import rdflib + +# bsfs imports +from bsfs.namespace import ns +from bsfs.utils import errors, URI, typename + +# inner-module imports +from . import types +from . import schema + +# exports +__all__: typing.Sequence[str] = ( + 'to_string', + 'from_string', + ) + + +## code ## + +def from_string(schema_str: str) -> schema.Schema: + """Load and return a Schema from a string.""" + # parse string into rdf graph + graph = rdflib.Graph() + graph.parse(data=schema_str, format='turtle') + + # helper functions + def _fetch_value( + subject: URI, + predicate: rdflib.URIRef, + value_factory: typing.Callable[[typing.Any], typing.Any], + ) -> typing.Optional[typing.Any]: + """Fetch the object of a given subject and predicate. + Raises a `errors.ConsistencyError` if multiple objects match. + """ + values = list(graph.objects(rdflib.URIRef(subject), predicate)) + if len(values) == 0: + return None + if len(values) == 1: + return value_factory(values[0]) + raise errors.ConsistencyError( + f'{subject} has multiple values for predicate {str(predicate)}, expected zero or one') + + def _convert(value): + """Convert the subject type from rdflib to a bsfs native type.""" + if isinstance(value, rdflib.Literal): + return value.value + if isinstance(value, rdflib.URIRef): + return URI(value) + # value is neither a node nor a literal, but e.g. a blank node + raise errors.BackendError(f'expected Literal or URIRef, found {typename(value)}') + + def _fetch_hierarchically(factory, curr): + """Walk through a rdfs:subClassOf hierarchy, creating symbols along the way.""" + # emit current node + yield curr + # walk through childs + for child in graph.subjects(rdflib.URIRef(ns.rdfs.subClassOf), rdflib.URIRef(curr.uri)): + # fetch annotations + annotations = { + URI(pred): _convert(value) + for pred, value # FIXME: preserve datatype of value?! + in graph.predicate_objects(child) + if URI(pred) != ns.rdfs.subClassOf + } + # convert child to URI + child = URI(child) + # check circular dependency + if child == curr.uri or child in {node.uri for node in curr.parents()}: + raise errors.ConsistencyError('circular dependency') + # recurse and emit (sub*)childs + yield from _fetch_hierarchically(factory, factory(child, curr, **annotations)) + + # fetch nodes + nodes = set(_fetch_hierarchically(types.Node, types.ROOT_NODE)) + nodes_lut = {node.uri: node for node in nodes} + if len(nodes_lut) != len(nodes): + raise errors.ConsistencyError('inconsistent nodes') + + # fetch literals + def _build_literal(uri, parent, **annotations): + """Literal factory.""" + # break out on root feature type + if uri == types.ROOT_FEATURE.uri: + return types.ROOT_FEATURE + # handle feature types + if isinstance(parent, types.Feature): + # clean annotations + annotations.pop(ns.bsfs.dimension, None) + annotations.pop(ns.bsfs.dtype, None) + annotations.pop(ns.bsfs.distance, None) + # get dimension + dimension = _fetch_value(uri, rdflib.URIRef(ns.bsfs.dimension), int) + # get dtype + dtype = _fetch_value(uri, rdflib.URIRef(ns.bsfs.dtype), URI) + # get distance + distance = _fetch_value(uri, rdflib.URIRef(ns.bsfs.distance), URI) + # return feature + return parent.child(URI(uri), dtype=dtype, dimension=dimension, distance=distance, **annotations) + # handle non-feature types + return parent.child(URI(uri), **annotations) + + literals = set(_fetch_hierarchically(_build_literal, types.ROOT_LITERAL)) + literals_lut = {lit.uri: lit for lit in literals} + if len(literals_lut) != len(literals): + raise errors.ConsistencyError('inconsistent literals') + + # fetch predicates + def _build_predicate(uri, parent, **annotations): + """Predicate factory.""" + # clean annotations + annotations.pop(ns.rdfs.domain, None) + annotations.pop(ns.rdfs.range, None) + annotations.pop(ns.bsfs.unique, None) + # get domain + dom = _fetch_value(uri, rdflib.RDFS.domain, URI) + if dom is not None and dom not in nodes_lut: + raise errors.ConsistencyError(f'predicate {uri} has undefined domain {dom}') + if dom is not None: + dom = nodes_lut[dom] + # get range + rng = _fetch_value(uri, rdflib.RDFS.range, URI) + if rng is not None and rng not in nodes_lut and rng not in literals_lut: + raise errors.ConsistencyError(f'predicate {uri} has undefined range {rng}') + if rng is not None: + rng = nodes_lut.get(rng, literals_lut.get(rng)) + # get unique + unique = _fetch_value(uri, rdflib.URIRef(ns.bsfs.unique), bool) + # build predicate + return parent.child(URI(uri), domain=dom, range=rng, unique=unique, **annotations) + + predicates = _fetch_hierarchically(_build_predicate, types.ROOT_PREDICATE) + + return schema.Schema(predicates, nodes, literals) + + + +def to_string(schema_inst: schema.Schema, fmt: str = 'turtle') -> str: + """Serialize a `bsfs.schema.Schema` to a string. + See `rdflib.Graph.serialize` for viable formats (default: turtle). + """ + + # type of emitted triples. + T_TRIPLE = typing.Iterator[typing.Tuple[rdflib.URIRef, rdflib.URIRef, rdflib.term.Identifier]] + + def _type(tpe: types._Type) -> T_TRIPLE : + """Emit _Type properties (parent, annotations).""" + # emit parent + if tpe.parent is not None: + yield ( + rdflib.URIRef(tpe.uri), + rdflib.URIRef(ns.rdfs.subClassOf), + rdflib.URIRef(tpe.parent.uri), + ) + # emit annotations + for prop, value in tpe.annotations.items(): + yield ( + rdflib.URIRef(tpe.uri), + rdflib.URIRef(prop), + rdflib.Literal(value), # FIXME: datatype?! + ) + + def _predicate(pred: types.Predicate) -> T_TRIPLE: + """Emit Predicate properties (domain, range, unique).""" + # no need to emit anything for the root predicate + if pred == types.ROOT_PREDICATE: + return + # emit domain + if pred.domain != getattr(pred.parent, 'domain', None): + yield ( + rdflib.URIRef(pred.uri), + rdflib.URIRef(ns.rdfs.domain), + rdflib.URIRef(pred.domain.uri), + ) + # emit range + if pred.range != getattr(pred.parent, 'range', None): + yield ( + rdflib.URIRef(pred.uri), + rdflib.URIRef(ns.rdfs.range), + rdflib.URIRef(pred.range.uri), + ) + # emit cardinality + if pred.unique != getattr(pred.parent, 'unique', None): + yield ( + rdflib.URIRef(pred.uri), + rdflib.URIRef(ns.bsfs.unique), + rdflib.Literal(pred.unique, datatype=rdflib.XSD.boolean), + ) + + def _feature(feat: types.Feature) -> T_TRIPLE: + """Emit Feature properties (dimension, dtype, distance).""" + # emit size + if feat.dimension != getattr(feat.parent, 'dimension', None): + yield ( + rdflib.URIRef(feat.uri), + rdflib.URIRef(ns.bsfs.dimension), + rdflib.Literal(feat.dimension, datatype=rdflib.XSD.integer), + ) + # emit dtype + if feat.dtype != getattr(feat.parent, 'dtype', None): + yield ( + rdflib.URIRef(feat.uri), + rdflib.URIRef(ns.bsfs.dtype), + rdflib.URIRef(feat.dtype), + ) + # emit distance + if feat.distance != getattr(feat.parent, 'distance', None): + yield ( + rdflib.URIRef(feat.uri), + rdflib.URIRef(ns.bsfs.distance), + rdflib.URIRef(feat.distance), + ) + + def _parse(node: types._Type) -> T_TRIPLE: + """Emit all properties of a type.""" + # check arg + if not isinstance(node, types._Type): # pylint: disable=protected-access + raise TypeError(node) + # emit _Type essentials + yield from _type(node) + # emit properties of derived types + if isinstance(node, types.Predicate): + yield from _predicate(node) + if isinstance(node, types.Feature): + yield from _feature(node) + + # create graph + graph = rdflib.Graph() + # add triples to graph + nodes = itertools.chain( + schema_inst.nodes(), + schema_inst.literals(), + schema_inst.predicates()) + for node in nodes: + for triple in _parse(node): + graph.add(triple) + # add known namespaces for readability + # FIXME: more generically? + graph.bind('bsfs', rdflib.URIRef(ns.bsfs + '/')) + graph.bind('bsl', rdflib.URIRef(ns.bsl + '/')) + graph.bind('bsn', rdflib.URIRef(ns.bsn + '#')) + graph.bind('bse', rdflib.URIRef(ns.bsfs.Entity() + '#')) + graph.bind('rdf', rdflib.URIRef(ns.rdf)) + graph.bind('rdfs', rdflib.URIRef(ns.rdfs)) + graph.bind('schema', rdflib.URIRef(ns.schema)) + graph.bind('xsd', rdflib.URIRef(ns.xsd)) + # serialize to turtle + return graph.serialize(format=fmt) + +## EOF ## diff --git a/bsfs/schema/types.py b/bsfs/schema/types.py index 54a7e99..5834df8 100644 --- a/bsfs/schema/types.py +++ b/bsfs/schema/types.py @@ -1,13 +1,9 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import typing # bsfs imports +from bsfs.namespace import ns from bsfs.utils import errors, URI, typename # exports @@ -15,6 +11,7 @@ __all__: typing.Sequence[str] = ( 'Literal', 'Node', 'Predicate', + 'Feature', ) @@ -99,9 +96,11 @@ class _Type(): self, uri: URI, parent: typing.Optional['_Type'] = None, + **annotations: typing.Any, ): - self.uri = uri + self.uri = URI(uri) self.parent = parent + self.annotations = annotations def parents(self) -> typing.Generator['_Type', None, None]: """Generate a list of parent nodes.""" @@ -110,9 +109,17 @@ class _Type(): yield curr curr = curr.parent - def get_child(self, uri: URI, **kwargs): + def child( + self, + uri: URI, + **kwargs, + ): """Return a child of the current class.""" - return type(self)(uri, self, **kwargs) + return type(self)( + uri=uri, + parent=self, + **kwargs + ) def __str__(self) -> str: return f'{typename(self)}({self.uri})' @@ -138,8 +145,10 @@ class _Type(): def __lt__(self, other: typing.Any) -> bool: """Return True iff *self* is a true subclass of *other*.""" - if not type(self) is type(other): # type mismatch # pylint: disable=unidiomatic-typecheck + if not isinstance(other, _Type): return NotImplemented + if not isinstance(other, type(self)): # FIXME: necessary? + return False if self.uri == other.uri: # equivalence return False if self in other.parents(): # superclass @@ -151,8 +160,10 @@ class _Type(): def __le__(self, other: typing.Any) -> bool: """Return True iff *self* is equivalent or a subclass of *other*.""" - if not type(self) is type(other): # type mismatch # pylint: disable=unidiomatic-typecheck + if not isinstance(other, _Type): return NotImplemented + if not isinstance(other, type(self)): # FIXME: necessary? + return False if self.uri == other.uri: # equivalence return True if self in other.parents(): # superclass @@ -164,8 +175,10 @@ class _Type(): def __gt__(self, other: typing.Any) -> bool: """Return True iff *self* is a true superclass of *other*.""" - if not type(self) is type(other): # type mismatch # pylint: disable=unidiomatic-typecheck + if not isinstance(other, _Type): return NotImplemented + if not isinstance(other, type(self)): # FIXME: necessary? + return False if self.uri == other.uri: # equivalence return False if self in other.parents(): # superclass @@ -177,8 +190,10 @@ class _Type(): def __ge__(self, other: typing.Any) -> bool: """Return True iff *self* is eqiuvalent or a superclass of *other*.""" - if not type(self) is type(other): # type mismatch # pylint: disable=unidiomatic-typecheck + if not isinstance(other, _Type): return NotImplemented + if not isinstance(other, type(self)): # FIXME: necessary? + return False if self.uri == other.uri: # equivalence return True if self in other.parents(): # superclass @@ -189,32 +204,95 @@ class _Type(): return False -class _Vertex(_Type): +class Vertex(_Type): """Graph vertex types. Can be a Node or a Literal.""" - def __init__(self, uri: URI, parent: typing.Optional['_Vertex']): - super().__init__(uri, parent) + parent: typing.Optional['Vertex'] + def __init__(self, uri: URI, parent: typing.Optional['Vertex'], **kwargs): + super().__init__(uri, parent, **kwargs) -class Node(_Vertex): +class Node(Vertex): """Node type.""" - def __init__(self, uri: URI, parent: typing.Optional['Node']): - super().__init__(uri, parent) + parent: typing.Optional['Node'] + def __init__(self, uri: URI, parent: typing.Optional['Node'], **kwargs): + super().__init__(uri, parent, **kwargs) -class Literal(_Vertex): +class Literal(Vertex): """Literal type.""" - def __init__(self, uri: URI, parent: typing.Optional['Literal']): - super().__init__(uri, parent) + parent: typing.Optional['Literal'] + def __init__(self, uri: URI, parent: typing.Optional['Literal'], **kwargs): + super().__init__(uri, parent, **kwargs) + + +class Feature(Literal): + """Feature type.""" + + # Number of feature vector dimensions. + dimension: int + + # Feature vector datatype. + dtype: URI + + # Distance measure to compare feature vectors. + distance: URI + + def __init__( + self, + # Type members + uri: URI, + parent: typing.Optional[Literal], + # Feature members + dimension: int, + dtype: URI, + distance: URI, + **kwargs, + ): + super().__init__(uri, parent, **kwargs) + self.dimension = int(dimension) + self.dtype = URI(dtype) + self.distance = URI(distance) + + def __hash__(self) -> int: + return hash((super().__hash__(), self.dimension, self.dtype, self.distance)) + + def __eq__(self, other: typing.Any) -> bool: + return super().__eq__(other) \ + and self.dimension == other.dimension \ + and self.dtype == other.dtype \ + and self.distance == other.distance + def child( + self, + uri: URI, + dimension: typing.Optional[int] = None, + dtype: typing.Optional[URI] = None, + distance: typing.Optional[URI] = None, + **kwargs, + ): + """Return a child of the current class.""" + if dimension is None: + dimension = self.dimension + if dtype is None: + dtype = self.dtype + if distance is None: + distance = self.distance + return super().child( + uri=uri, + dimension=dimension, + dtype=dtype, + distance=distance, + **kwargs, + ) class Predicate(_Type): - """Predicate type.""" + """Predicate base type.""" # source type. domain: Node # destination type. - range: typing.Optional[typing.Union[Node, Literal]] + range: Vertex # maximum cardinality of type. unique: bool @@ -226,22 +304,23 @@ class Predicate(_Type): parent: typing.Optional['Predicate'], # Predicate members domain: Node, - range: typing.Optional[typing.Union[Node, Literal]], # pylint: disable=redefined-builtin + range: Vertex, # pylint: disable=redefined-builtin unique: bool, + **kwargs, ): # check arguments if not isinstance(domain, Node): raise TypeError(domain) - if range is not None and not isinstance(range, Node) and not isinstance(range, Literal): + if range != ROOT_VERTEX and not isinstance(range, (Node, Literal)): raise TypeError(range) # initialize - super().__init__(uri, parent) + super().__init__(uri, parent, **kwargs) self.domain = domain self.range = range - self.unique = unique + self.unique = bool(unique) def __hash__(self) -> int: - return hash((super().__hash__(), self.domain, self.range, self.unique)) + return hash((super().__hash__(), self.domain, self.unique, self.range)) def __eq__(self, other: typing.Any) -> bool: return super().__eq__(other) \ @@ -249,11 +328,11 @@ class Predicate(_Type): and self.range == other.range \ and self.unique == other.unique - def get_child( + def child( self, uri: URI, domain: typing.Optional[Node] = None, - range: typing.Optional[_Vertex] = None, # pylint: disable=redefined-builtin + range: typing.Optional[Vertex] = None, # pylint: disable=redefined-builtin unique: typing.Optional[bool] = None, **kwargs, ): @@ -264,13 +343,73 @@ class Predicate(_Type): raise errors.ConsistencyError(f'{domain} must be a subclass of {self.domain}') if range is None: range = self.range - if range is None: # inherited range from ns.bsfs.Predicate - raise ValueError('range must be defined by the parent or argument') - if self.range is not None and not range <= self.range: + # NOTE: The root predicate has a Vertex as range, which is neither a parent of the root + # Node nor Literal. Hence, that test is skipped since a child should be allowed to + # specialize from Vertex to anything. + if self.range != ROOT_VERTEX and not range <= self.range: raise errors.ConsistencyError(f'{range} must be a subclass of {self.range}') if unique is None: unique = self.unique - return super().get_child(uri, domain=domain, range=range, unique=unique, **kwargs) + return super().child( + uri=uri, + domain=domain, + range=range, + unique=unique, + **kwargs + ) + + +# essential vertices +ROOT_VERTEX = Vertex( + uri=ns.bsfs.Vertex, + parent=None, + ) + +ROOT_NODE = Node( + uri=ns.bsfs.Node, + parent=None, + ) + +ROOT_LITERAL = Literal( + uri=ns.bsfs.Literal, + parent=None, + ) + +ROOT_BLOB = Literal( + uri=ns.bsl.BinaryBlob, + parent=ROOT_LITERAL, + ) + +ROOT_NUMBER = Literal( + uri=ns.bsl.Number, + parent=ROOT_LITERAL, + ) + +ROOT_TIME = Literal( + uri=ns.bsl.Time, + parent=ROOT_LITERAL, + ) + +ROOT_ARRAY = Literal( + uri=ns.bsl.Array, + parent=ROOT_LITERAL, + ) +ROOT_FEATURE = Feature( + uri=ns.bsl.Array.Feature, + parent=ROOT_ARRAY, + dimension=1, + dtype=ns.bsfs.dtype().f16, + distance=ns.bsd.euclidean, + ) + +# essential predicates +ROOT_PREDICATE = Predicate( + uri=ns.bsfs.Predicate, + parent=None, + domain=ROOT_NODE, + range=ROOT_VERTEX, + unique=False, + ) ## EOF ## diff --git a/bsfs/triple_store/__init__.py b/bsfs/triple_store/__init__.py index fb5a8a9..79a2887 100644 --- a/bsfs/triple_store/__init__.py +++ b/bsfs/triple_store/__init__.py @@ -1,9 +1,4 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import typing diff --git a/bsfs/triple_store/base.py b/bsfs/triple_store/base.py index 6561262..58b5670 100644 --- a/bsfs/triple_store/base.py +++ b/bsfs/triple_store/base.py @@ -1,16 +1,12 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import abc import typing # inner-module imports +from bsfs.query import ast from bsfs.utils import URI, typename -import bsfs.schema as _schema +import bsfs.schema as bsc # exports __all__: typing.Sequence[str] = ( @@ -81,12 +77,12 @@ class TripleStoreBase(abc.ABC): @property @abc.abstractmethod - def schema(self) -> _schema.Schema: + def schema(self) -> bsc.Schema: """Return the store's local schema.""" @schema.setter @abc.abstractmethod - def schema(self, schema: _schema.Schema): + def schema(self, schema: bsc.Schema): """Migrate to new schema by adding or removing class definitions. Commits before and after the migration. @@ -109,9 +105,30 @@ class TripleStoreBase(abc.ABC): """ @abc.abstractmethod + def get( + self, + node_type: bsc.Node, + filter: typing.Optional[ast.filter.FilterExpression] = None, # pylint: disable=redefined-builtin + ) -> typing.Iterator[URI]: + """Return guids of nodes of type *node_type* that match the *filter*. + Return all guids of the respective type if *filter* is None. + """ + + @abc.abstractmethod + def fetch( + self, + node_type: bsc.Node, + filter: ast.filter.FilterExpression, # pylint: disable=redefined-builtin + fetch: ast.fetch.FetchExpression, + ) -> typing.Iterator[typing.Tuple[URI, str, typing.Any]]: + """Return (guid, name, value) triples where the guid is determined by the *filter* + query and the name matches the *fetch* query. + """ + + @abc.abstractmethod def exists( self, - node_type: _schema.Node, + node_type: bsc.Node, guids: typing.Iterable[URI], ) -> typing.Iterable[URI]: """Return those *guids* that exist and have type *node_type* or a subclass thereof.""" @@ -119,7 +136,7 @@ class TripleStoreBase(abc.ABC): @abc.abstractmethod def create( self, - node_type: _schema.Node, + node_type: bsc.Node, guids: typing.Iterable[URI], ): """Create *guid* nodes with type *subject*.""" @@ -127,9 +144,9 @@ class TripleStoreBase(abc.ABC): @abc.abstractmethod def set( self, - node_type: _schema.Node, # FIXME: is the node_type even needed? Couldn't I infer from the predicate? + node_type: bsc.Node, # FIXME: is the node_type even needed? Couldn't I infer from the predicate? guids: typing.Iterable[URI], - predicate: _schema.Predicate, + predicate: bsc.Predicate, values: typing.Iterable[typing.Any], ): """Add triples to the graph. diff --git a/bsfs/triple_store/sparql/__init__.py b/bsfs/triple_store/sparql/__init__.py new file mode 100644 index 0000000..cfa2732 --- /dev/null +++ b/bsfs/triple_store/sparql/__init__.py @@ -0,0 +1,13 @@ + +# imports +import typing + +# inner-module imports +from .sparql import SparqlStore + +# exports +__all__: typing.Sequence[str] = ( + 'SparqlStore', + ) + +## EOF ## diff --git a/bsfs/triple_store/sparql/distance.py b/bsfs/triple_store/sparql/distance.py new file mode 100644 index 0000000..2c2f355 --- /dev/null +++ b/bsfs/triple_store/sparql/distance.py @@ -0,0 +1,51 @@ + +# standard imports +import typing + +# external imports +import numpy as np + +# bsfs imports +from bsfs.namespace import ns + +# constants +EPS = 1e-9 + +# exports +__all__: typing.Sequence[str] = ( + 'DISTANCE_FU', + ) + + +## code ## + +def euclid(fst, snd) -> float: + """Euclidean distance (l2 norm).""" + fst = np.array(fst) + snd = np.array(snd) + return float(np.linalg.norm(fst - snd)) + +def cosine(fst, snd) -> float: + """Cosine distance.""" + fst = np.array(fst) + snd = np.array(snd) + if (fst == snd).all(): + return 0.0 + nrm0 = np.linalg.norm(fst) + nrm1 = np.linalg.norm(snd) + return float(1.0 - np.dot(fst, snd) / (nrm0 * nrm1 + EPS)) + +def manhatten(fst, snd) -> float: + """Manhatten (cityblock) distance (l1 norm).""" + fst = np.array(fst) + snd = np.array(snd) + return float(np.abs(fst - snd).sum()) + +# Known distance functions. +DISTANCE_FU = { + ns.bsd.euclidean: euclid, + ns.bsd.cosine: cosine, + ns.bsd.manhatten: manhatten, +} + +## EOF ## diff --git a/bsfs/triple_store/sparql/parse_fetch.py b/bsfs/triple_store/sparql/parse_fetch.py new file mode 100644 index 0000000..fab8173 --- /dev/null +++ b/bsfs/triple_store/sparql/parse_fetch.py @@ -0,0 +1,104 @@ + +# standard imports +import typing + +# bsfs imports +from bsfs import schema as bsc +from bsfs.query import ast +from bsfs.utils import errors + +# inner-module imports +from .utils import GenHopName, Query + +# exports +__all__: typing.Sequence[str] = ( + 'Fetch', + ) + + +## code ## + +class Fetch(): + """Translate `bsfs.query.ast.fetch` structures into Sparql queries.""" + + def __init__(self, schema): + self.schema = schema + self.ngen = GenHopName(prefix='?fch') + + def __call__( + self, + root_type: bsc.Node, + root: ast.fetch.FetchExpression, + ) -> Query: + """ + """ + # check root_type + if not isinstance(root_type, bsc.Node): + raise errors.BackendError(f'expected Node, found {root_type}') + if root_type not in self.schema.nodes(): + raise errors.ConsistencyError(f'node {root_type} is not in the schema') + # parse root + terms, expr = self._parse_fetch_expression(root_type, root, '?ent') + # assemble query + return Query( + root_type=root_type.uri, + root_head='?ent', + select=terms, + where=expr, + ) + + def _parse_fetch_expression( + self, + node_type: bsc.Vertex, + node: ast.fetch.FetchExpression, + head: str, + ): + """Route *node* to the handler of the respective FetchExpression subclass.""" + if isinstance(node, ast.fetch.All): + return self._all(node_type, node, head) + if isinstance(node, ast.fetch.Fetch): + return self._fetch(node_type, node, head) + if isinstance(node, ast.fetch.Node): + return self._node(node_type, node, head) + if isinstance(node, ast.fetch.Value): + return self._value(node_type, node, head) + if isinstance(node, ast.fetch.This): + return self._this(node_type, node, head) + # invalid node + raise errors.BackendError(f'expected fetch expression, found {node}') + + def _all(self, node_type: bsc.Vertex, node: ast.fetch.All, head: str): + # child expressions + terms, exprs = zip(*[self._parse_fetch_expression(node_type, expr, head) for expr in node]) + terms = {term for sub in terms for term in sub} + exprs = ' .\n'.join({expr for expr in exprs if len(expr.strip()) > 0}) + return terms, exprs + + def _fetch(self, node_type: bsc.Vertex, node: ast.fetch.Fetch, head: str): # pylint: disable=unused-argument # (node_type) + # child expressions + rng = self.schema.predicate(node.predicate).range + nexthead = next(self.ngen) + terms, expr = self._parse_fetch_expression(rng, node.expr, nexthead) + return terms, f'OPTIONAL{{ {head} <{node.predicate}> {nexthead} .\n {expr} }}' + + def _node(self, node_type: bsc.Vertex, node: ast.fetch.Node, head: str): # pylint: disable=unused-argument # (node_type) + if f'?{node.name}'.startswith(self.ngen.prefix): + raise errors.BackendError(f'Node name must start with {self.ngen.prefix}') + # compose and return statement + term = next(self.ngen) + return {(term, node.name)}, f'OPTIONAL{{ {head} <{node.predicate}> {term} }}' + + def _value(self, node_type: bsc.Vertex, node: ast.fetch.Value, head: str): # pylint: disable=unused-argument # (node_type) + if f'?{node.name}'.startswith(self.ngen.prefix): + raise errors.BackendError(f'Value name must start with {self.ngen.prefix}') + # compose and return statement + term = next(self.ngen) + return {(term, node.name)}, f'OPTIONAL{{ {head} <{node.predicate}> {term} }}' + + def _this(self, node_type: bsc.Vertex, node: ast.fetch.This, head: str): # pylint: disable=unused-argument # (node_type) + if f'?{node.name}'.startswith(self.ngen.prefix): + raise errors.BackendError(f'This name must start with {self.ngen.prefix}') + # compose and return statement + return {(head, node.name)}, '' + +## EOF ## diff --git a/bsfs/triple_store/sparql/parse_filter.py b/bsfs/triple_store/sparql/parse_filter.py new file mode 100644 index 0000000..2f5a25b --- /dev/null +++ b/bsfs/triple_store/sparql/parse_filter.py @@ -0,0 +1,316 @@ + +# imports +import operator +import typing + +# external imports +import rdflib + +# bsfs imports +from bsfs import schema as bsc +from bsfs.namespace import ns +from bsfs.query import ast +from bsfs.utils import URI, errors + +# inner-module imports +from .distance import DISTANCE_FU +from .utils import GenHopName, Query + +# exports +__all__: typing.Sequence[str] = ( + 'Filter', + ) + + +## code ## + +class Filter(): + """Translate `bsfs.query.ast.filter` structures into Sparql queries.""" + + # Current schema to validate against. + schema: bsc.Schema + + # Generator that produces unique symbol names. + ngen: GenHopName + + def __init__(self, graph, schema): + self.graph = graph + self.schema = schema + self.ngen = GenHopName(prefix='?flt') + + def __call__( + self, + root_type: bsc.Node, + root: typing.Optional[ast.filter.FilterExpression] = None, + ) -> Query: + """ + """ + # check root_type + if not isinstance(root_type, bsc.Node): + raise errors.BackendError(f'expected Node, found {root_type}') + if root_type not in self.schema.nodes(): + raise errors.ConsistencyError(f'node {root_type} is not in the schema') + # parse root + if root is None: + cond = '' + else: + cond = self._parse_filter_expression(root_type, root, '?ent') + # assemble query + return Query( + root_type=root_type.uri, + root_head='?ent', + where=cond, + ) + + def _parse_filter_expression( + self, + type_: bsc.Vertex, + node: ast.filter.FilterExpression, + head: str, + ) -> str: + """Route *node* to the handler of the respective FilterExpression subclass.""" + if isinstance(node, ast.filter.Is): + return self._is(type_, node, head) + if isinstance(node, ast.filter.Not): + return self._not(type_, node, head) + if isinstance(node, ast.filter.Has): + return self._has(type_, node, head) + if isinstance(node, ast.filter.Distance): + return self._distance(type_, node, head) + if isinstance(node, ast.filter.Any): + return self._any(type_, node, head) + if isinstance(node, ast.filter.All): + return self._all(type_, node, head) + if isinstance(node, ast.filter.And): + return self._and(type_, node, head) + if isinstance(node, ast.filter.Or): + return self._or(type_, node, head) + if isinstance(node, ast.filter.Equals): + return self._equals(type_, node, head) + if isinstance(node, ast.filter.Substring): + return self._substring(type_, node, head) + if isinstance(node, ast.filter.StartsWith): + return self._starts_with(type_, node, head) + if isinstance(node, ast.filter.EndsWith): + return self._ends_with(type_, node, head) + if isinstance(node, ast.filter.LessThan): + return self._less_than(type_, node, head) + if isinstance(node, ast.filter.GreaterThan): + return self._greater_than(type_, node, head) + # invalid node + raise errors.BackendError(f'expected filter expression, found {node}') + + def _parse_predicate_expression( + self, + type_: bsc.Vertex, + node: ast.filter.PredicateExpression + ) -> typing.Tuple[str, bsc.Vertex]: + """Route *node* to the handler of the respective PredicateExpression subclass.""" + if isinstance(node, ast.filter.Predicate): + return self._predicate(type_, node) + if isinstance(node, ast.filter.OneOf): + return self._one_of(type_, node) + # invalid node + raise errors.BackendError(f'expected predicate expression, found {node}') + + def _one_of(self, node_type: bsc.Vertex, node: ast.filter.OneOf) -> typing.Tuple[str, bsc.Vertex]: + """ + """ + if not isinstance(node_type, bsc.Node): + raise errors.BackendError(f'expected Node, found {node_type}') + # walk through predicates + suburi, rng = set(), None + for pred in node: # OneOf guarantees at least one expression + puri, subrng = self._parse_predicate_expression(node_type, pred) + # track predicate uris + suburi.add(puri) + # check for more generic range + if rng is None or subrng > rng: + rng = subrng + # check range consistency + if not subrng <= rng and not subrng >= rng: + raise errors.ConsistencyError(f'ranges {subrng} and {rng} are not related') + # return joint predicate expression and next range + # OneOf guarantees at least one expression, rng is always a bsc.Vertex. + # mypy does not realize this, hence we ignore the warning. + return '|'.join(suburi), rng # type: ignore [return-value] + + def _predicate(self, node_type: bsc.Vertex, node: ast.filter.Predicate) -> typing.Tuple[str, bsc.Vertex]: + """ + """ + # check node_type + if not isinstance(node_type, bsc.Node): + raise errors.BackendError(f'expected Node, found {node_type}') + # fetch predicate and its uri + puri = node.predicate + # get and check predicate, domain, and range + if not self.schema.has_predicate(puri): + raise errors.ConsistencyError(f'predicate {puri} is not in the schema') + pred = self.schema.predicate(puri) + if not isinstance(pred.range, (bsc.Node, bsc.Literal)): + raise errors.BackendError(f'the range of predicate {pred} is undefined') + dom, rng = pred.domain, pred.range + # encapsulate predicate uri + uri_str = f'<{puri}>' + # apply reverse flag + if node.reverse: + uri_str = '^' + uri_str + dom, rng = rng, dom # type: ignore [assignment] # variable re-use confuses mypy + # check path consistency + if not node_type <= dom: + raise errors.ConsistencyError(f'expected type {dom} or subtype thereof, found {node_type}') + # return predicate URI and next node type + return uri_str, rng + + def _any(self, node_type: bsc.Vertex, node: ast.filter.Any, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Node): + raise errors.BackendError(f'expected Node, found {node_type}') + # parse predicate + pred, next_type = self._parse_predicate_expression(node_type, node.predicate) + # parse expression + nexthead = next(self.ngen) + expr = self._parse_filter_expression(next_type, node.expr, nexthead) + # combine results + return f'{head} {pred} {nexthead} . {expr}' + + def _all(self, node_type: bsc.Vertex, node: ast.filter.All, head: str) -> str: + """ + """ + # NOTE: All(P, E) := Not(Any(P, Not(E))) and EXISTS(P, ?) + if not isinstance(node_type, bsc.Node): + raise errors.BackendError(f'expected Node, found {node_type}') + # parse rewritten ast + expr = self._parse_filter_expression(node_type, + ast.filter.Not( + ast.filter.Any(node.predicate, + ast.filter.Not(node.expr))), head) + # parse predicate for existence constraint + pred, _ = self._parse_predicate_expression(node_type, node.predicate) + temphead = next(self.ngen) + # return existence and rewritten expression + return f'FILTER EXISTS {{ {head} {pred} {temphead} }} . ' + expr + + def _and(self, node_type: bsc.Vertex, node: ast.filter.And, head: str) -> str: + """ + """ + sub = [self._parse_filter_expression(node_type, expr, head) for expr in node] + return ' . '.join(sub) + + def _or(self, node_type: bsc.Vertex, node: ast.filter.Or, head: str) -> str: + """ + """ + # potential special case optimization: + # * ast: Or(Equals('foo'), Equals('bar'), ...) + # * query: VALUES ?head { "value1"^^<...> "value2"^^<...> "value3"^<...> ... } + sub = [self._parse_filter_expression(node_type, expr, head) for expr in node] + sub = ['{' + expr + '}' for expr in sub] + return ' UNION '.join(sub) + + def _not(self, node_type: bsc.Vertex, node: ast.filter.Not, head: str) -> str: + """ + """ + expr = self._parse_filter_expression(node_type, node.expr, head) + if isinstance(node_type, bsc.Literal): + return f'MINUS {{ {expr} }}' + # NOTE: for bsc.Node types, we must include at least one expression in the body of MINUS, + # otherwise the connection between the context and body of MINUS is lost. + # The simplest (and non-interfering) choice is a type statement. + return f'MINUS {{ {head} <{ns.rdf.type}>/<{ns.rdfs.subClassOf}>* <{node_type.uri}> . {expr} }}' + + def _has(self, node_type: bsc.Vertex, node: ast.filter.Has, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Node): + raise errors.BackendError(f'expected Node, found {node_type}') + # parse predicate + pred, _ = self._parse_predicate_expression(node_type, node.predicate) + # get new heads + inner = next(self.ngen) + outer = next(self.ngen) + # predicate count expression (fetch number of predicates at *head*) + num_preds = f'{{ SELECT (COUNT(distinct {inner}) as {outer}) WHERE {{ {head} {pred} {inner} }} }}' + # count expression + count_bounds = self._parse_filter_expression(self.schema.literal(ns.xsd.integer), node.count, outer) + # combine + return num_preds + ' . ' + count_bounds + + def _distance(self, node_type: bsc.Vertex, node: ast.filter.Distance, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Feature): + raise errors.BackendError(f'expected Feature, found {node_type}') + if len(node.reference) != node_type.dimension: + raise errors.ConsistencyError( + f'reference has dimension {len(node.reference)}, expected {node_type.dimension}') + # get distance metric + dist = DISTANCE_FU[node_type.distance] + # get operator + cmp = operator.lt if node.strict else operator.le + # get candidate values + candidates = { + f'"{cand}"^^<{node_type.uri}>' + for cand + in self.graph.objects() + if isinstance(cand, rdflib.Literal) + and cand.datatype == rdflib.URIRef(node_type.uri) + and cmp(dist(cand.value, node.reference), node.threshold) + } + # combine candidate values + values = ' '.join(candidates) if len(candidates) else f'"impossible value"^^<{ns.xsd.string}>' + # return sparql fragment + return f'VALUES {head} {{ {values} }}' + + def _is(self, node_type: bsc.Vertex, node: ast.filter.Is, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Node): + raise errors.BackendError(f'expected Node, found {node_type}') + return f'VALUES {head} {{ <{URI(node.value)}> }}' + + def _equals(self, node_type: bsc.Vertex, node: ast.filter.Equals, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Literal): + raise errors.BackendError(f'expected Literal, found {node}') + return f'VALUES {head} {{ "{node.value}"^^<{node_type.uri}> }}' + + def _substring(self, node_type: bsc.Vertex, node: ast.filter.Substring, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Literal): + raise errors.BackendError(f'expected Literal, found {node_type}') + return f'FILTER contains(str({head}), "{node.value}")' + + def _starts_with(self, node_type: bsc.Vertex, node: ast.filter.StartsWith, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Literal): + raise errors.BackendError(f'expected Literal, found {node_type}') + return f'FILTER strstarts(str({head}), "{node.value}")' + + def _ends_with(self, node_type: bsc.Vertex, node: ast.filter.EndsWith, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Literal): + raise errors.BackendError(f'expected Literal, found {node_type}') + return f'FILTER strends(str({head}), "{node.value}")' + + def _less_than(self, node_type: bsc.Vertex, node: ast.filter.LessThan, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Literal): + raise errors.BackendError(f'expected Literal, found {node_type}') + equality = '=' if not node.strict else '' + return f'FILTER ({head} <{equality} {float(node.threshold)})' + + def _greater_than(self, node_type: bsc.Vertex, node: ast.filter.GreaterThan, head: str) -> str: + """ + """ + if not isinstance(node_type, bsc.Literal): + raise errors.BackendError(f'expected Literal, found {node_type}') + equality = '=' if not node.strict else '' + return f'FILTER ({head} >{equality} {float(node.threshold)})' + +## EOF ## diff --git a/bsfs/triple_store/sparql.py b/bsfs/triple_store/sparql/sparql.py index 7516dff..99e67d6 100644 --- a/bsfs/triple_store/sparql.py +++ b/bsfs/triple_store/sparql/sparql.py @@ -1,20 +1,23 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports +import base64 import itertools import typing + +# external imports import rdflib # bsfs imports from bsfs import schema as bsc +from bsfs.namespace import ns +from bsfs.query import ast from bsfs.utils import errors, URI # inner-module imports -from . import base +from . import parse_fetch +from . import parse_filter +from .. import base +from .distance import DISTANCE_FU # exports @@ -25,6 +28,8 @@ __all__: typing.Sequence[str] = ( ## code ## +rdflib.term.bind(ns.bsl.BinaryBlob, bytes, constructor=base64.b64decode) + class _Transaction(): """Lightweight rdflib transactions for in-memory databases.""" @@ -85,11 +90,19 @@ class SparqlStore(base.TripleStoreBase): # The local schema. _schema: bsc.Schema + # Filter parser + _filter_parser: parse_filter.Filter + + # Fetch parser + _fetch_parser: parse_fetch.Fetch + def __init__(self): super().__init__(None) self._graph = rdflib.Graph() self._transaction = _Transaction(self._graph) - self._schema = bsc.Schema.Empty() + self._schema = bsc.Schema(literals={bsc.ROOT_NUMBER.child(ns.xsd.integer)}) + self._filter_parser = parse_filter.Filter(self._graph, self._schema) + self._fetch_parser = parse_fetch.Fetch(self._schema) # NOTE: mypy and pylint complain about the **kwargs not being listed (contrasting super) # However, not having it here is clearer since it's explicit that there are no arguments. @@ -115,6 +128,16 @@ class SparqlStore(base.TripleStoreBase): # check compatibility: No contradicting definitions if not self.schema.consistent_with(schema): raise errors.ConsistencyError(f'{schema} is inconsistent with {self.schema}') + # check distance functions of features + invalid = { + (cand.uri, cand.distance) + for cand + in schema.literals() + if isinstance(cand, bsc.Feature) and cand.distance not in DISTANCE_FU} + if len(invalid) > 0: + cand, dist = zip(*invalid) + raise errors.UnsupportedError( + f'unknown distance function {",".join(dist)} in feature {", ".join(cand)}') # commit the current transaction self.commit() @@ -126,10 +149,17 @@ class SparqlStore(base.TripleStoreBase): # get deleted classes sub = self.schema - schema - # remove predicate instances for pred in sub.predicates: + # remove predicate instances for src, trg in self._graph.subject_objects(rdflib.URIRef(pred.uri)): self._transaction.remove((src, rdflib.URIRef(pred.uri), trg)) + # remove predicate definition + if pred.parent is not None: # NOTE: there shouldn't be any predicate w/o parent + self._transaction.remove(( + rdflib.URIRef(pred.uri), + rdflib.RDFS.subClassOf, + rdflib.URIRef(pred.parent.uri), + )) # remove node instances for node in sub.nodes: @@ -143,15 +173,82 @@ class SparqlStore(base.TripleStoreBase): self._transaction.remove((inst, pred, trg)) # remove instance self._transaction.remove((inst, rdflib.RDF.type, rdflib.URIRef(node.uri))) - - # NOTE: Nothing to do for literals + # remove node definition + if node.parent is not None: # NOTE: there shouldn't be any node w/o parent + self._transaction.remove(( + rdflib.URIRef(node.uri), + rdflib.RDFS.subClassOf, + rdflib.URIRef(node.parent.uri), + )) + + for lit in sub.literals: + # remove literal definition + if lit.parent is not None: # NOTE: there shouldn't be any literal w/o parent + self._transaction.remove(( + rdflib.URIRef(lit.uri), + rdflib.RDFS.subClassOf, + rdflib.URIRef(lit.parent.uri), + )) + + # add predicate, node, and literal hierarchies to the graph + for itm in itertools.chain(schema.predicates(), schema.nodes(), schema.literals()): + if itm.parent is not None: + self._transaction.add((rdflib.URIRef(itm.uri), rdflib.RDFS.subClassOf, rdflib.URIRef(itm.parent.uri))) # commit instance changes self.commit() # migrate schema self._schema = schema + self._filter_parser.schema = schema + self._fetch_parser.schema = schema + def fetch( + self, + node_type: bsc.Node, + filter: ast.filter.FilterExpression, # pylint: disable=redefined-builtin + fetch: ast.fetch.FetchExpression, + ) -> typing.Iterator[typing.Tuple[URI, str, typing.Any]]: + if node_type not in self.schema.nodes(): + raise errors.ConsistencyError(f'{node_type} is not defined in the schema') + if not isinstance(filter, ast.filter.FilterExpression): + raise TypeError(filter) + if not isinstance(fetch, ast.fetch.FetchExpression): + raise TypeError(fetch) + # compose a query from fetch and filter ast + query = self._filter_parser(node_type, filter) + query += self._fetch_parser(node_type, fetch) + # run query + emitted = set() + for result in query(self._graph): + guid = URI(result[0]) + for name, raw in zip(query.names, result[1:]): + if raw is None: # undefined value + continue + if isinstance(raw, rdflib.Literal): + value = raw.value + else: + value = URI(raw) + # emit triple + triple = (guid, name, value) + if triple not in emitted: # FIXME: needs a better solution! + emitted.add(triple) + yield guid, name, value + + def get( + self, + node_type: bsc.Node, + filter: typing.Optional[ast.filter.FilterExpression] = None, # pylint: disable=redefined-builtin + ) -> typing.Iterator[URI]: + if node_type not in self.schema.nodes(): + raise errors.ConsistencyError(f'{node_type} is not defined in the schema') + if filter is not None and not isinstance(filter, ast.filter.FilterExpression): + raise TypeError(filter) + # compose query + query = self._filter_parser(node_type, filter) + # run query + for guid, in query(self._graph): + yield URI(guid) def _has_type(self, subject: URI, node_type: bsc.Node) -> bool: """Return True if *subject* is a node of class *node_type* or a subclass thereof.""" @@ -187,7 +284,7 @@ class SparqlStore(base.TripleStoreBase): raise errors.ConsistencyError(f'{node_type} is not defined in the schema') # check and create guids for guid in guids: - subject = rdflib.URIRef(guid) + subject = rdflib.URIRef(URI(guid)) # check node existence if (subject, rdflib.RDF.type, None) in self._graph: # FIXME: node exists and may have a different type! ignore? raise? report? @@ -226,7 +323,7 @@ class SparqlStore(base.TripleStoreBase): raise errors.InstanceError(inconsistent) # check guids # FIXME: Fail or skip inexistent nodes? - guids = set(guids) + guids = {URI(guid) for guid in guids} inconsistent = {guid for guid in guids if not self._has_type(guid, node_type)} if len(inconsistent) > 0: raise errors.InstanceError(inconsistent) @@ -237,7 +334,11 @@ class SparqlStore(base.TripleStoreBase): guid = rdflib.URIRef(guid) # convert value if isinstance(predicate.range, bsc.Literal): - value = rdflib.Literal(value, datatype=rdflib.URIRef(predicate.range.uri)) + dtype = rdflib.URIRef(predicate.range.uri) + if predicate.range <= self.schema.literal(ns.bsl.BinaryBlob): + dtype = rdflib.URIRef(ns.bsl.BinaryBlob) + value = base64.b64encode(value) + value = rdflib.Literal(value, datatype=dtype) elif isinstance(predicate.range, bsc.Node): value = rdflib.URIRef(value) else: diff --git a/bsfs/triple_store/sparql/utils.py b/bsfs/triple_store/sparql/utils.py new file mode 100644 index 0000000..38062c2 --- /dev/null +++ b/bsfs/triple_store/sparql/utils.py @@ -0,0 +1,137 @@ + +# standard imports +import typing + +# external imports +import rdflib + +# bsfs imports +from bsfs.namespace import ns +from bsfs.utils import typename + +# exports +__all__: typing.Sequence[str] = ( + 'GenHopName', + 'Query', + ) + + +## code ## + +class GenHopName(): + """Generator that produces a new unique symbol name with each iteration.""" + + # Symbol name prefix. + prefix: str + + # Current counter. + curr: int + + def __init__(self, prefix: str = '?hop', start: int = 0): + self.prefix = prefix + self.curr = start - 1 + + def __next__(self): + """Generate and return the next unique name.""" + self.curr += 1 + return self.prefix + str(self.curr) + + +class Query(): + """Hold, manage, and complete partial Sparql queries.""" + + # root node type URI. + root_type: str + + # root node variable name. + root_head: str + + # (head, name) tuples (w/o root) + select: typing.Tuple[typing.Tuple[str, str], ...] + + # where statements. + where: str + + def __init__( + self, + root_type: str, + root_head: str = '?ent', + select: typing.Optional[typing.Iterable[typing.Tuple[str, str]]] = None, + where: typing.Optional[str] = None, + ): + # check arguments + if select is None: + select = [] + if where is None: + where = '' + # set members + self.root_type = root_type + self.root_head = root_head + self.select = tuple(select) # tuple ensures presistent order + self.where = where.strip() + + def __str__(self) -> str: + return self.query + + def __repr__(self) -> str: + return f'{typename(self)}({self.root_type}, {self.root_head}, {self.select}, {self.where})' + + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) \ + and self.root_type == other.root_type \ + and self.root_head == other.root_head \ + and self.select == other.select \ + and self.where == other.where + + def __hash__(self) -> int: + return hash((type(self), self.root_type, self.root_head, self.select, self.where)) + + def __add__(self, other: typing.Any) -> 'Query': + # check other's type + if not isinstance(other, type(self)): + return NotImplemented + # check query compatibility + if not self.root_type == other.root_type: + raise ValueError(other) + if not self.root_head == other.root_head: + raise ValueError(other) + # combine selections + select = self.select + other.select + # combine conditions + conds = [] + if self.where != '': + conds.append(self.where) + if other.where != '': + conds.append(other.where) + where = ' . '.join(conds) + # return new query + return Query( + root_type=self.root_type, + root_head=self.root_head, + select=select, + where=where, + ) + + @property + def names(self) -> typing.Tuple[str, ...]: + """Return a tuple of selected variable names, excluding the root.""" + return tuple(name for _, name in self.select) + + @property + def query(self) -> str: + """Return an executable sparql query.""" + select = ' '.join(f'({head} as ?{name})' for head, name in self.select) + return f''' + SELECT DISTINCT {self.root_head} {select} + WHERE {{ + {self.root_head} <{ns.rdf.type}>/<{ns.rdfs.subClassOf}>* <{self.root_type}> . + {self.where} + }} + ORDER BY str({self.root_head}) + ''' + + def __call__(self, graph: rdflib.Graph) -> rdflib.query.Result: + """Execute the query on a *graph* and return the query result.""" + return graph.query(self.query) + +## EOF ## diff --git a/bsfs/utils/__init__.py b/bsfs/utils/__init__.py index 94680ee..d497645 100644 --- a/bsfs/utils/__init__.py +++ b/bsfs/utils/__init__.py @@ -1,15 +1,10 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import typing # inner-module imports from . import errors -from .commons import typename +from .commons import typename, normalize_args from .uri import URI from .uuid import UUID, UCID @@ -19,6 +14,7 @@ __all__ : typing.Sequence[str] = ( 'URI', 'UUID', 'errors', + 'normalize_args', 'typename', ) diff --git a/bsfs/utils/commons.py b/bsfs/utils/commons.py index bad2fe0..a7092ae 100644 --- a/bsfs/utils/commons.py +++ b/bsfs/utils/commons.py @@ -1,14 +1,11 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports +from collections import abc import typing # exports __all__: typing.Sequence[str] = ( + 'normalize_args', 'typename', ) @@ -19,5 +16,37 @@ def typename(obj) -> str: """Return the type name of *obj*.""" return type(obj).__name__ +# argument type in `normalize_args`. +ArgType = typing.TypeVar('ArgType') # pylint: disable=invalid-name # type vars don't follow the usual convention + +def normalize_args( + *args: typing.Union[ArgType, typing.Iterable[ArgType], typing.Iterator[ArgType]] + ) -> typing.Tuple[ArgType, ...]: + """Arguments to a function can be passed as individual arguments, list-like + structures, or iterables. This function processes any of these styles and + returns a tuple of the respective items. Typically used within a function + provide a flexible interface but sill have parameters in a normalized form. + + Examples: + + >>> normalize_args(0,1,2) + (1,2,3) + >>> normalize_args([0,1,2]) + (1,2,3) + >>> normalize_args(range(3)) + (1,2,3) + + """ + if len(args) == 0: # foo() + return tuple() + if len(args) > 1: # foo(0, 1, 2) + return tuple(args) # type: ignore [arg-type] # we assume that argument styles (arg vs. iterable) are not mixed. + if isinstance(args[0], abc.Iterator): # foo(iter([0,1,2])) + return tuple(args[0]) + if isinstance(args[0], abc.Iterable) and not isinstance(args[0], str): # foo([0, 1, 2]) + return tuple(args[0]) + # foo(0) + return (args[0], ) # type: ignore [return-value] # if args[0] is a str, we assume that ArgType was str. + ## EOF ## diff --git a/bsfs/utils/errors.py b/bsfs/utils/errors.py index c5e8e16..b82e6e2 100644 --- a/bsfs/utils/errors.py +++ b/bsfs/utils/errors.py @@ -1,9 +1,4 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import typing @@ -38,4 +33,10 @@ class UnreachableError(ProgrammingError): class ConfigError(_BSFSError): """User config issue.""" +class BackendError(_BSFSError): + """Could not parse an AST structure.""" + +class UnsupportedError(_BSFSError): + """Some requested functionality is not supported by an implementation.""" + ## EOF ## diff --git a/bsfs/utils/uri.py b/bsfs/utils/uri.py index 84854a4..5755a6e 100644 --- a/bsfs/utils/uri.py +++ b/bsfs/utils/uri.py @@ -1,14 +1,11 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports import re import typing # constants +RX_CHARS = re.compile(r'[<>" {}|\\^]') + RX_URI = re.compile(r''' ^ (?:(?P<scheme>[^:/?#]+):)? # scheme, ://-delimited @@ -82,6 +79,9 @@ class URI(str): no claim about the validity of an URI! """ + # check characters + if RX_CHARS.search(query) is not None: + return False # check uri parts = RX_URI.match(query) if parts is not None: @@ -232,9 +232,6 @@ class URI(str): # overload formatting methods - def format(self, *args, **kwargs) -> 'URI': - return URI(super().format(*args, **kwargs)) - def __mod__(self, *args) -> 'URI': return URI(super().__mod__(*args)) diff --git a/bsfs/utils/uuid.py b/bsfs/utils/uuid.py index 6366b18..ad7fc1c 100644 --- a/bsfs/utils/uuid.py +++ b/bsfs/utils/uuid.py @@ -1,12 +1,9 @@ -""" -Part of the BlackStar filesystem (bsfs) module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" # imports from collections import abc import hashlib +import io +import json import os import platform import random @@ -105,4 +102,21 @@ class UCID(): with open(path, 'rb') as ifile: return HASH(ifile.read()).hexdigest() + @staticmethod + def from_buffer(buffer: io.IOBase) -> str: + """Read the content from a buffer.""" + if isinstance(buffer, io.TextIOBase): + return HASH(buffer.read().encode('utf-8', errors='ignore')).hexdigest() + return HASH(buffer.read()).hexdigest() + + @staticmethod + def from_bytes(content: bytes) -> str: + """Get the content from as bytes.""" + return HASH(content).hexdigest() + + @staticmethod + def from_dict(content: dict) -> str: + """Get the content from a dict.""" + return HASH(json.dumps(content).encode('ascii', 'ignore')).hexdigest() + ## EOF ## |