From 28a021483c13e974e00b6159f0653b0727df9d10 Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Thu, 2 Mar 2023 16:40:00 +0100 Subject: prohibit certain characters in URI and ensure URIs in bsfs.graph --- bsfs/graph/nodes.py | 5 ++--- bsfs/schema/types.py | 2 +- bsfs/triple_store/sparql/parse_filter.py | 6 ++---- bsfs/triple_store/sparql/sparql.py | 10 ++-------- bsfs/utils/uri.py | 8 +++++--- test/graph/test_nodes.py | 19 ++++++++++++------- test/query/ast_test/test_filter_.py | 2 +- test/triple_store/sparql/test_parse_filter.py | 2 +- test/triple_store/sparql/test_sparql.py | 4 ++-- test/utils/test_uri.py | 19 ++++++++++++++----- 10 files changed, 42 insertions(+), 35 deletions(-) diff --git a/bsfs/graph/nodes.py b/bsfs/graph/nodes.py index c3530c1..84996c7 100644 --- a/bsfs/graph/nodes.py +++ b/bsfs/graph/nodes.py @@ -52,9 +52,8 @@ class Nodes(): self._backend = backend self._ac = access_control self._node_type = node_type - self._guids = set(guids) - # create helper instances - # FIXME: Assumes that the schema does not change while the instance is in use! + # convert to URI since this is not guaranteed by Graph + self._guids = {URI(guid) for guid in guids} def __eq__(self, other: typing.Any) -> bool: return isinstance(other, Nodes) \ diff --git a/bsfs/schema/types.py b/bsfs/schema/types.py index 54adffb..104580d 100644 --- a/bsfs/schema/types.py +++ b/bsfs/schema/types.py @@ -98,7 +98,7 @@ class _Type(): parent: typing.Optional['_Type'] = None, **annotations: typing.Any, ): - self.uri = uri + self.uri = URI(uri) self.parent = parent self.annotations = annotations diff --git a/bsfs/triple_store/sparql/parse_filter.py b/bsfs/triple_store/sparql/parse_filter.py index 8959b2c..bf19a02 100644 --- a/bsfs/triple_store/sparql/parse_filter.py +++ b/bsfs/triple_store/sparql/parse_filter.py @@ -154,7 +154,7 @@ class Filter(): puri = f'<{puri}>' # type: ignore [assignment] # variable re-use confuses mypy # apply reverse flag if node.reverse: - puri = URI('^' + puri) + puri = '^' + puri dom, rng = rng, dom # type: ignore [assignment] # variable re-use confuses mypy # check path consistency if not node_type <= dom: @@ -267,9 +267,7 @@ class Filter(): """ if not isinstance(node_type, bsc.Node): raise errors.BackendError(f'expected Node, found {node_type}') - if not rdflib.term._is_valid_uri(node.value): # pylint: disable=protected-access - raise errors.BackendError(f'<{node.value}> is not a serializable uri') - return f'VALUES {head} {{ <{node.value}> }}' + return f'VALUES {head} {{ <{URI(node.value)}> }}' def _equals(self, node_type: bsc.Vertex, node: ast.filter.Equals, head: str) -> str: """ diff --git a/bsfs/triple_store/sparql/sparql.py b/bsfs/triple_store/sparql/sparql.py index bd98f46..68c0027 100644 --- a/bsfs/triple_store/sparql/sparql.py +++ b/bsfs/triple_store/sparql/sparql.py @@ -284,10 +284,7 @@ class SparqlStore(base.TripleStoreBase): raise errors.ConsistencyError(f'{node_type} is not defined in the schema') # check and create guids for guid in guids: - # check convert to rdflib.URIRef - if not rdflib.term._is_valid_uri(guid): # pylint: disable=protected-access - raise ValueError(guids) - subject = rdflib.URIRef(guid) + subject = rdflib.URIRef(URI(guid)) # check node existence if (subject, rdflib.RDF.type, None) in self._graph: # FIXME: node exists and may have a different type! ignore? raise? report? @@ -326,10 +323,7 @@ class SparqlStore(base.TripleStoreBase): raise errors.InstanceError(inconsistent) # check guids # FIXME: Fail or skip inexistent nodes? - guids = set(guids) - invalid = {guid for guid in guids if not rdflib.term._is_valid_uri(guid)} # pylint: disable=protected-access - if len(invalid) > 0: - raise ValueError(invalid) + guids = {URI(guid) for guid in guids} inconsistent = {guid for guid in guids if not self._has_type(guid, node_type)} if len(inconsistent) > 0: raise errors.InstanceError(inconsistent) diff --git a/bsfs/utils/uri.py b/bsfs/utils/uri.py index 0693017..5755a6e 100644 --- a/bsfs/utils/uri.py +++ b/bsfs/utils/uri.py @@ -4,6 +4,8 @@ import re import typing # constants +RX_CHARS = re.compile(r'[<>" {}|\\^]') + RX_URI = re.compile(r''' ^ (?:(?P[^:/?#]+):)? # scheme, ://-delimited @@ -77,6 +79,9 @@ class URI(str): no claim about the validity of an URI! """ + # check characters + if RX_CHARS.search(query) is not None: + return False # check uri parts = RX_URI.match(query) if parts is not None: @@ -227,9 +232,6 @@ class URI(str): # overload formatting methods - def format(self, *args, **kwargs) -> 'URI': - return URI(super().format(*args, **kwargs)) - def __mod__(self, *args) -> 'URI': return URI(super().__mod__(*args)) diff --git a/test/graph/test_nodes.py b/test/graph/test_nodes.py index 083b2d8..dca887a 100644 --- a/test/graph/test_nodes.py +++ b/test/graph/test_nodes.py @@ -123,15 +123,19 @@ class TestNodes(unittest.TestCase): URI('http://example.com/me/tag#4321'), } + def test_construct(self): + self.assertIsInstance(Nodes(self.backend, self.ac, self.ent_type, {'http://example.com/me-and-you'}), Nodes) + self.assertRaises(ValueError, Nodes, self.backend, self.ac, self.ent_type, {'http://example.com/me and you'}) + def test_str(self): # str baseline - nodes = Nodes(self.backend, self.ac, self.ent_type, self.ent_ids) - self.assertEqual(str(nodes), f'Nodes({self.ent_type}, {self.ent_ids})') - self.assertEqual(repr(nodes), f'Nodes({self.backend}, {self.ac}, {self.ent_type}, {self.ent_ids})') + nodes = Nodes(self.backend, self.ac, self.ent_type, {URI('http://example.com/me/entity#1234')}) + self.assertEqual(str(nodes), f"Nodes({self.ent_type}, {{'http://example.com/me/entity#1234'}})") + self.assertEqual(repr(nodes), f"Nodes({self.backend}, {self.ac}, {self.ent_type}, {{'http://example.com/me/entity#1234'}})") # str respects node_type - nodes = Nodes(self.backend, self.ac, self.tag_type, self.tag_ids) - self.assertEqual(str(nodes), f'Nodes({self.tag_type}, {self.tag_ids})') - self.assertEqual(repr(nodes), f'Nodes({self.backend}, {self.ac}, {self.tag_type}, {self.tag_ids})') + nodes = Nodes(self.backend, self.ac, self.tag_type, {URI('http://example.com/me/tag#1234')}) + self.assertEqual(str(nodes), f"Nodes({self.tag_type}, {{'http://example.com/me/tag#1234'}})") + self.assertEqual(repr(nodes), f"Nodes({self.backend}, {self.ac}, {self.tag_type}, {{'http://example.com/me/tag#1234'}})") # str respects guids nodes = Nodes(self.backend, self.ac, self.ent_type, {URI('http://example.com/me/entity#foo')}) self.assertEqual(str(nodes), f'Nodes({self.ent_type}, {{\'http://example.com/me/entity#foo\'}})') @@ -437,7 +441,8 @@ class TestNodes(unittest.TestCase): self.assertRaises(TypeError, nodes.get, 1234) self.assertRaises(TypeError, nodes.get, (ns.bse.tag, 1234)) self.assertRaises(TypeError, nodes.get, (1234, ns.bse.tag)) - self.assertRaises(errors.ConsistencyError, nodes.get, 'hello world') + self.assertRaises(ValueError, nodes.get, 'hello world') + self.assertRaises(errors.ConsistencyError, nodes.get, 'hello_world') self.assertRaises(errors.ConsistencyError, nodes.get, ns.bse.invalid) self.assertRaises(errors.ConsistencyError, nodes.get, (ns.bse.tag, bst.invalid)) # can pass multiple paths diff --git a/test/query/ast_test/test_filter_.py b/test/query/ast_test/test_filter_.py index cdc530c..d0d42ea 100644 --- a/test/query/ast_test/test_filter_.py +++ b/test/query/ast_test/test_filter_.py @@ -382,7 +382,7 @@ class TestPredicate(unittest.TestCase): # member returns predicate # predicate must be an URI self.assertEqual(Predicate(ns.bse.filesize).predicate, ns.bse.filesize) - self.assertEqual(Predicate(URI('hello world')).predicate, URI('hello world')) + self.assertEqual(Predicate(URI('hello_world')).predicate, URI('hello_world')) self.assertRaises(TypeError, Predicate, 1234) self.assertRaises(TypeError, Predicate, FilterExpression()) self.assertRaises(TypeError, Predicate, FilterExpression()) diff --git a/test/triple_store/sparql/test_parse_filter.py b/test/triple_store/sparql/test_parse_filter.py index 6db9224..5b6ca8a 100644 --- a/test/triple_store/sparql/test_parse_filter.py +++ b/test/triple_store/sparql/test_parse_filter.py @@ -158,7 +158,7 @@ class TestParseFilter(unittest.TestCase): # _is requires a node self.assertRaises(errors.BackendError, self.parser._is, self.schema.literal(ns.bsfs.Literal), ast.filter.Is('http://example.com/entity#1234'), '?ent') # _is requires a serializable guid - self.assertRaises(errors.BackendError, self.parser._is, self.schema.node(ns.bsfs.Entity), ast.filter.Is('http://example.com/entity#foo and bar'), '?ent') + self.assertRaises(ValueError, self.parser._is, self.schema.node(ns.bsfs.Entity), ast.filter.Is('http://example.com/entity#foo and bar'), '?ent') # a single Is statement q = self.parser(self.schema.node(ns.bsfs.Entity), ast.filter.Is('http://example.com/entity#1234')) self.assertSetEqual({str(guid) for guid, in q(self.graph)}, diff --git a/test/triple_store/sparql/test_sparql.py b/test/triple_store/sparql/test_sparql.py index d880082..f45ca37 100644 --- a/test/triple_store/sparql/test_sparql.py +++ b/test/triple_store/sparql/test_sparql.py @@ -679,7 +679,7 @@ class TestSparqlStore(unittest.TestCase): URI('http://example.com/me/entity#1234'), URI('http://example.com/me/entity#4321')}) # guid must be valid - self.assertRaises(ValueError, store.create, self.schema.node(ns.bsfs.Entity), {URI('http://example.com/me/foo and bar')}) + self.assertRaises(ValueError, store.create, self.schema.node(ns.bsfs.Entity), {'http://example.com/me/foo and bar'}) # can create some nodes ent_type = store.schema.node(ns.bsfs.Entity) @@ -770,7 +770,7 @@ class TestSparqlStore(unittest.TestCase): self.assertRaises(errors.ConsistencyError, store.set, ent_type, ent_ids, p_invalid, {'http://example.com/me/tag#1234'}) # invalid guid is not permitted - self.assertRaises(ValueError, store.set, ent_type, {URI('http://example.com/me/foo and bar')}, p_filesize, {1234}) + self.assertRaises(ValueError, store.set, ent_type, {'http://example.com/me/foo and bar'}, p_filesize, {1234}) # predicate must match node_type self.assertRaises(errors.ConsistencyError, store.set, tag_type, tag_ids, p_filesize, {1234}) diff --git a/test/utils/test_uri.py b/test/utils/test_uri.py index 6ee2ef7..1c4c9f9 100644 --- a/test/utils/test_uri.py +++ b/test/utils/test_uri.py @@ -35,6 +35,16 @@ class TestURI(unittest.TestCase): self.assertTrue(URI.is_parseable('telnet://192.0.2.16:80/')) self.assertTrue(URI.is_parseable('urn:oasis:names:specification:docbook:dtd:xml:4.1.2')) + # some characters are prohibited + self.assertFalse(URI.is_parseable('http://example.com/foobar')) + self.assertFalse(URI.is_parseable('http://example.com/foo bar')) + self.assertFalse(URI.is_parseable('http://example.com/foo{bar')) + self.assertFalse(URI.is_parseable('http://example.com/foo}bar')) + self.assertFalse(URI.is_parseable('http://example.com/foo|bar')) + self.assertFalse(URI.is_parseable('http://example.com/foo^bar')) + self.assertFalse(URI.is_parseable('http://example.com/foo\\bar')) + # uri cannot end with a scheme delimiter self.assertFalse(URI.is_parseable('http://')) # port must be a number @@ -159,10 +169,10 @@ class TestURI(unittest.TestCase): def test_overloaded(self): # composition - self.assertIsInstance(URI('http://user@www.example.com:1234/{}/path1?{}#fragment') + 'hello', URI) - self.assertIsInstance(URI('http://user@www.example.com:1234/{}/path1?{}#fragment') * 2, URI) - self.assertIsInstance(2 * URI('http://user@www.example.com:1234/{}/path1?{}#fragment'), URI) # rmul - self.assertIsInstance(URI('http://user@www.example.com:1234/{}/path1?{}#fragment').join(['hello', 'world']) , URI) + self.assertIsInstance(URI('http://user@www.example.com:1234/path0/path1?query#fragment') + 'hello', URI) + self.assertIsInstance(URI('http://user@www.example.com:1234/path0/path1?query#fragment') * 2, URI) + self.assertIsInstance(2 * URI('http://user@www.example.com:1234/path0/path1?query#fragment'), URI) # rmul + self.assertIsInstance(URI('http://user@www.example.com:1234/path0/path1?query#fragment').join(['hello', 'world']) , URI) # stripping self.assertIsInstance(URI('http://user@www.example.com:1234/path0/path1?query#fragment').strip(), URI) self.assertIsInstance(URI('http://user@www.example.com:1234/path0/path1?query#fragment').lstrip(), URI) @@ -171,7 +181,6 @@ class TestURI(unittest.TestCase): self.assertIsInstance(URI('http://user@www.example.com:1234/path0/path1?query#fragment').lower(), URI) self.assertIsInstance(URI('http://user@www.example.com:1234/path0/path1?query#fragment').upper(), URI) # formatting - self.assertIsInstance(URI('http://user@www.example.com:1234/{}/path1?{}#fragment').format('hello', 'world'), URI) self.assertIsInstance(URI('http://user@www.example.com:1234/%s/path1?%s#fragment') % ('hello', 'world'), URI) self.assertIsInstance(URI('http://user@www.example.com:1234/path0/path1?query#fragment').replace('path0', 'pathX'), URI) -- cgit v1.2.3