diff options
-rw-r--r-- | bsfs/utils/__init__.py | 5 | ||||
-rw-r--r-- | bsfs/utils/errors.py | 38 | ||||
-rw-r--r-- | bsfs/utils/uuid.py | 108 | ||||
-rw-r--r-- | test/utils/test_uuid.py | 92 | ||||
-rw-r--r-- | test/utils/testfile.t | 1 |
5 files changed, 244 insertions, 0 deletions
diff --git a/bsfs/utils/__init__.py b/bsfs/utils/__init__.py index 56a9323..94680ee 100644 --- a/bsfs/utils/__init__.py +++ b/bsfs/utils/__init__.py @@ -8,12 +8,17 @@ Author: Matthias Baumgartner, 2022 import typing # inner-module imports +from . import errors from .commons import typename from .uri import URI +from .uuid import UUID, UCID # exports __all__ : typing.Sequence[str] = ( + 'UCID', 'URI', + 'UUID', + 'errors', 'typename', ) diff --git a/bsfs/utils/errors.py b/bsfs/utils/errors.py new file mode 100644 index 0000000..04561a2 --- /dev/null +++ b/bsfs/utils/errors.py @@ -0,0 +1,38 @@ +""" + +Part of the BlackStar filesystem (bsfs) module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# exports +__all__: typing.Sequence[str] = ( + ) + + +## code ## + +class _BSFSError(Exception): + """Generic bsfs error.""" + +class SchemaError(_BSFSError): + """Generic schema errios.""" + +class ConsistencyError(SchemaError): + """A requested operation is inconsistent with the schema.""" + +class InstanceError(SchemaError): + """An instance affected by some operation is inconsistent with the schema.""" + +class PermissionDeniedError(_BSFSError): + """An operation was aborted due to access control restrictions.""" + +class ProgrammingError(_BSFSError): + """An assertion-like error that indicates a code-base issue.""" + +class UnreachableError(ProgrammingError): + """Bravo, you've reached a point in code that should logically not be reachable.""" + +## EOF ## diff --git a/bsfs/utils/uuid.py b/bsfs/utils/uuid.py new file mode 100644 index 0000000..7c39128 --- /dev/null +++ b/bsfs/utils/uuid.py @@ -0,0 +1,108 @@ +""" + +Part of the BlackStar filesystem (bsfs) module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +from collections import abc +import hashlib +import os +import platform +import random +import threading +import time +import typing +import uuid + +# constants +HASH = hashlib.sha256 + +# exports +__all__: typing.Sequence[str] = [ + 'UCID', + 'UUID', + ] + + +## code ## + +class UUID(abc.Iterator, abc.Callable): + """Generate 256-bit universally unique IDs. + + This is a 'best-effort' kind of implementation that tries to ensure global + uniqueness, even tough actual uniqueness cannot be guaranteed. + The approach is different from python's uuid module (which implements + RFC 4122) in that it generates longer UUIDs and in that it cannot be + reconstructed whether two UUIDs were generated on the same system. + + The ID is a cryptographic hash over several components: + * host + * system + * process + * thread + * random + * time + * cpu cycles + * content (if available) + + """ + + # host identifier + host: str + + # system identifier + system: str + + # process identifier + process: str + + # thread identifier + thread: str + + def __init__(self, seed: typing.Optional[int] = None): + # initialize static components + self.host = str(uuid.getnode()) + self.system = '-'.join(platform.uname()) + self.process = str(os.getpid()) + self.thread = str(threading.get_ident()) + # initialize random component + random.seed(seed) + + def __call__(self, content: typing.Optional[str] = None) -> str: + """Return a globally unique ID.""" + # content component + content = str(content) if content is not None else '' + # time component + now = str(time.time()) + # clock component + clk = str(time.perf_counter()) + # random component + rnd = str(random.random()) + # build the token from all available components + token = self.host + self.system + self.process + self.thread + rnd + now + clk + content + # return the token's hash + return HASH(token.encode('ascii', 'ignore')).hexdigest() + + def __iter__(self) -> typing.Iterator[str]: + """Iterate indefinitely over universally unique IDs.""" + return self + + def __next__(self) -> str: + """Generate universally unique IDs.""" + return self() + + +class UCID(abc.Callable): + """Generate 256-bit content IDs. + + Effectively computes a cryptographic hash over the content. + + """ + @staticmethod + def from_path(path: str) -> str: + """Read the content from a file.""" + with open(path, 'rb') as ifile: + return HASH(ifile.read()).hexdigest() + +## EOF ## diff --git a/test/utils/test_uuid.py b/test/utils/test_uuid.py new file mode 100644 index 0000000..49176d4 --- /dev/null +++ b/test/utils/test_uuid.py @@ -0,0 +1,92 @@ +""" + +Part of the tagit test suite. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import os +import re +import unittest + +# objects to test +from bsfs.utils.uuid import UUID, UCID + + +## code ## + +class TestUUID(unittest.TestCase): + """Test the UUID generator. + + The UUID is expected to generate random strings of 64 characters(0-9, A-F, case insensitive). + Due to the random nature of UUIDs, we cannot actually check if an uid is 'valid' besides + matching the expected format. + + At best, we can check if the number of collisions (values generated repeatedly) is below some + threshold. One would expect the number of collisions to increase with the number of generated uids. + Hence, we only perform an empirical test, whereas the exact test parameters (NUM_SAMPLES, + COLLISIONS_THRESHOLD) are subject to the application requirements. Note that this simple test + cannot replace a thorough statistical analysis. + + """ + + # expected uuid string format + _RX_FORMAT = re.compile('[0-9A-Fa-f]{64}') + + # number of uuids to generate for collisions test + _NUM_SAMPLES = 100_000 + + # number of permitted collisions (less-than test; exclusive) + _COLLISIONS_THRESHOLD = 2 # zero or one collisions to pass the test + + def _test_format(self, uid): + self.assertIsInstance(uid, str) + self.assertTrue(self._RX_FORMAT.fullmatch(uid) is not None) + + def test_call(self): + gen = UUID() + # w/o content + self._test_format(gen()) + # with content + self._test_format(gen('hello world')) + + def test_iter(self): + for _, uid in zip(range(1_000), iter(UUID())): + self._test_format(uid) + + def test_next(self): + gen = UUID() + for _ in range(1_000): + uid = next(gen) + self._test_format(uid) + + def test_collisions(self): + # generated uuids are reasonably unique. + # Note that we cannot guarantee no collisions. + uids = {uid for _, uid in zip(range(self._NUM_SAMPLES), UUID())} + self.assertGreater(len(uids), self._NUM_SAMPLES - self._COLLISIONS_THRESHOLD) + # uuids are reasonably unique across instances + uidA = {uid for _, uid in zip(range(self._NUM_SAMPLES), UUID())} + uidB = {uid for _, uid in zip(range(self._NUM_SAMPLES), UUID())} + self.assertLess(len(uidA & uidB), self._COLLISIONS_THRESHOLD) + # uuids are reasonably unique despite identical seeds. + uidA = {uid for _, uid in zip(range(self._NUM_SAMPLES), UUID(seed=123))} + uidB = {uid for _, uid in zip(range(self._NUM_SAMPLES), UUID(seed=123))} + self.assertLess(len(uidA & uidB), self._COLLISIONS_THRESHOLD) + + +class TestUCID(unittest.TestCase): + def setUp(self): + self._checksum = 'a948904f2f0f479b8f8197694b30184b0d2ed1c1cd2a1ec0fb85d299a192a447' # sha256 + self._path = os.path.join(os.path.dirname(__file__), 'testfile.t') + + def test_from_path(self): + self.assertEqual(UCID.from_path(self._path), self._checksum) + + +## main ## + +if __name__ == '__main__': + unittest.main() + +## EOF ## diff --git a/test/utils/testfile.t b/test/utils/testfile.t new file mode 100644 index 0000000..3b18e51 --- /dev/null +++ b/test/utils/testfile.t @@ -0,0 +1 @@ +hello world |