From ed2074ae88f2db6cb6b38716b43b35e29eb2e16c Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Fri, 23 Dec 2022 16:25:51 +0100 Subject: filematcher: check file properties, formulate them as a string --- bsie/utils/__init__.py | 2 + bsie/utils/filematcher/__init__.py | 20 +++++ bsie/utils/filematcher/matcher.py | 177 +++++++++++++++++++++++++++++++++++++ bsie/utils/filematcher/parser.py | 148 +++++++++++++++++++++++++++++++ 4 files changed, 347 insertions(+) create mode 100644 bsie/utils/filematcher/__init__.py create mode 100644 bsie/utils/filematcher/matcher.py create mode 100644 bsie/utils/filematcher/parser.py (limited to 'bsie/utils') diff --git a/bsie/utils/__init__.py b/bsie/utils/__init__.py index bd22236..3981dc7 100644 --- a/bsie/utils/__init__.py +++ b/bsie/utils/__init__.py @@ -11,9 +11,11 @@ import typing from . import bsfs from . import namespaces as ns from . import node +from . import filematcher # exports __all__: typing.Sequence[str] = ( + 'filematcher', 'bsfs', 'node', 'ns', diff --git a/bsie/utils/filematcher/__init__.py b/bsie/utils/filematcher/__init__.py new file mode 100644 index 0000000..b1c1b45 --- /dev/null +++ b/bsie/utils/filematcher/__init__.py @@ -0,0 +1,20 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# inner-module imports +from .matcher import Matcher +from .parser import parse + +# exports +__all__: typing.Sequence[str] = ( + 'Matcher', + 'parse', + ) + +## EOF ## diff --git a/bsie/utils/filematcher/matcher.py b/bsie/utils/filematcher/matcher.py new file mode 100644 index 0000000..164beeb --- /dev/null +++ b/bsie/utils/filematcher/matcher.py @@ -0,0 +1,177 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2021 +""" +# imports +from collections.abc import Callable, Collection, Hashable +import abc +import os +import typing +import magic + +# exports +__all__: typing.Sequence[str] = [] + + +## code ## + +# abstract nodes + +class Matcher(abc.ABC, Hashable, Callable, Collection): # type: ignore [misc] # Invalid base class Callable + """Matcher node base class.""" + + # child expressions or terminals + _childs: typing.Set[typing.Any] + + def __init__(self, *childs: typing.Any): + if len(childs) == 1 and isinstance(childs[0], (list, tuple, set)): + self._childs = set(childs[0]) + else: + self._childs = set(childs) + + def __contains__(self, needle: typing.Any) -> bool: + return needle in self._childs + + def __iter__(self) -> typing.Iterator[typing.Any]: + return iter(self._childs) + + def __len__(self) -> int: + return len(self._childs) + + def __repr__(self) -> str: + return f'{type(self).__name__}({self._childs})' + + def __hash__(self) -> int: + return hash((type(self), tuple(set(self._childs)))) + + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) \ + and self._childs == other._childs + + @abc.abstractmethod + def __call__(self, path: str) -> bool: # pylint: disable=arguments-differ + """Check if *path* satisfies the conditions set by the Matcher instance.""" + +class NOT(Matcher): + """Invert a matcher result.""" + def __init__(self, expr: Matcher): + super().__init__(expr) + def __call__(self, path: str) -> bool: + return not next(iter(self._childs))(path) + +# aggregate nodes + +class Aggregate(Matcher): # pylint: disable=too-few-public-methods # Yeah, it's an interface... + """Aggregation function base class (And, Or).""" + +class And(Aggregate): + """Accept only if all conditions are satisfied.""" + def __call__(self, path: str) -> bool: + for itm in self: + if not itm(path): + return False + return True + +class Or(Aggregate): + """Accept only if at least one condition is satisfied.""" + def __call__(self, path: str) -> bool: + for itm in self: + if itm(path): + return True + return False + + +# criteria nodes + +class Criterion(Matcher): + """Criterion base class. Limits acceptance to certain values.""" + def accepted(self) -> typing.Set[typing.Any]: + """Return a set of accepted values.""" + return self._childs + +# criteria w/o value (valueless) + +class Any(Criterion): + """Accepts anything.""" + def __call__(self, path: str) -> bool: + return True + +class Nothing(Criterion): + """Accepts nothing.""" + def __call__(self, path: str) -> bool: + return False + +class Exists(Criterion): + """Filters by existence.""" + def __call__(self, path: str) -> bool: + return os.path.exists(path) + +class IsFile(Criterion): + """Checks if the path is a regular file.""" + def __call__(self, path: str) -> bool: + return os.path.isfile(path) + +class IsDir(Criterion): + """Checks if the path is a directory.""" + def __call__(self, path: str) -> bool: + return os.path.isdir(path) + +class IsLink(Criterion): + """Checks if the path is a link.""" + def __call__(self, path: str) -> bool: + return os.path.islink(path) + +class IsAbs(Criterion): + """Checks if the path is an absolute path.""" + def __call__(self, path: str) -> bool: + return os.path.isabs(path) + +class IsRel(Criterion): + """Checks if the path is a relative path.""" + def __call__(self, path: str) -> bool: + return not os.path.isabs(path) + +class IsMount(Criterion): + """Checks if the path is a mount point.""" + def __call__(self, path: str) -> bool: + return os.path.ismount(path) + +class IsEmpty(Criterion): + """Checks if the path is an empty file.""" + def __call__(self, path: str) -> bool: + return os.path.exists(path) and os.stat(path).st_size == 0 + +class IsReadable(Criterion): + """Checks if the path is readable.""" + def __call__(self, path: str) -> bool: + return os.path.exists(path) and os.access(path, os.R_OK) + +class IsWritable(Criterion): + """Checks if the path is writable.""" + def __call__(self, path: str) -> bool: + return os.path.exists(path) and os.access(path, os.W_OK) + +class IsExecutable(Criterion): + """Checks if the path is executable.""" + def __call__(self, path: str) -> bool: + return os.path.exists(path) and os.access(path, os.X_OK) + +# criteria w/ value + +class Extension(Criterion): + """Filters by file extension (without the dot).""" + def __call__(self, path: str) -> bool: + _, ext = os.path.splitext(path) + return ext[1:] in self.accepted() + +class Mime(Criterion): + """Filters by mime type.""" + def __call__(self, path: str) -> bool: + try: + return magic.from_file(path, mime=True).lower() in self.accepted() + except FileNotFoundError: + return False + +## EOF ## diff --git a/bsie/utils/filematcher/parser.py b/bsie/utils/filematcher/parser.py new file mode 100644 index 0000000..0654742 --- /dev/null +++ b/bsie/utils/filematcher/parser.py @@ -0,0 +1,148 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2021 +""" +# standard imports +import typing + +# non-standard imports +import pyparsing +from pyparsing import printables, alphas8bit, punc8bit, QuotedString, Word, \ + delimitedList, Or, CaselessKeyword, Group, oneOf, Optional + +# bsie imports +from bsie.base import errors + +# inner-module imports +from . import matcher + +# exports +__all__: typing.Sequence[str] = ( + 'parse', + ) + + +## code ## + +class FileMatcherParser(): + """ + EXPR := RULES | RULES "|" RULES + RULESET := RULE | RULE, RULE + RULE := CRITERION OP VALUE | CRITERION OP {VALUES} | VALUELESS + OP := != | = + VALUES := VALUE | VALUE, VALUE + VALUE := [word] + CRITERION := mime | extension | ... + """ + + # criteria matcher nodes w/ arguments + _CRITERIA: typing.Dict[str, typing.Type[matcher.Matcher]] = { + 'extension': matcher.Extension, + 'mime': matcher.Mime, + } + + # criteria matcher nodes w/o arguments + _VALUELESS: typing.Dict[str, typing.Type[matcher.Matcher]] = { + 'any': matcher.Any, + 'nothing': matcher.Nothing, + 'exists': matcher.Exists, + 'isfile': matcher.IsFile, + 'isdir': matcher.IsDir, + 'islink': matcher.IsLink, + 'isabs': matcher.IsAbs, + 'isrel': matcher.IsRel, + 'ismount': matcher.IsMount, + 'emtpy': matcher.IsEmpty, + 'readable': matcher.IsReadable, + 'writable': matcher.IsWritable, + 'executable': matcher.IsExecutable, + } + + # pyparsing parser instance. + _parser: pyparsing.ParseExpression + + def __init__(self): + # build the parser + # VALUE := [word] + alphabet = (printables + alphas8bit + punc8bit).translate(str.maketrans('', '', ',{}|=')) + value = QuotedString(quoteChar='"', escChar='\\') ^ Word(alphabet) + # CRITERION := mime | extension | ... + criterion = Or([CaselessKeyword(p) for p in self._CRITERIA]).setResultsName('criterion') + valueless = Or([CaselessKeyword(p) for p in self._VALUELESS]).setResultsName('criterion') + # VALUES := VALUE | VALUE, VALUE + values = delimitedList(value, delim=',').setResultsName('value') + # OP := '=' | '!=' + eqop = oneOf('= !=').setResultsName('op') + # RULE := CRITERION OP VALUE | CRITERION OP {VALUES} | VALUELESS + rule_none = Group(Optional('!').setResultsName('op') + valueless).setResultsName('rule_none') + rule_one = Group(criterion + eqop + value.setResultsName('value')).setResultsName('rule_one') + rule_few = Group(criterion + eqop + '{' + values + '}').setResultsName('rule_few') + # RULESET := RULE | RULE, RULE + ruleset = Group(delimitedList(rule_none ^ rule_one ^ rule_few, delim=',')) + # EXPR := RULESET | RULESET \| RULESET + self._parser = delimitedList(ruleset, delim='|') + + def parse(self, query: str) -> matcher.Matcher: # pylint: disable=too-many-branches + """Build a file matcher from a rule definition.""" + # preprocess the query + query = query.strip() + + # empty query + if len(query) == 0: + return matcher.Any() + + try: + parsed = self._parser.parseString(query, parseAll=True) + except pyparsing.ParseException as err: + raise errors.ParserError(f'Cannot parse query {err}') + + # convert to Matcher + rules = [] + for exp in parsed: + tokens = [] + for rule in exp: + # fetch accepted values + if rule.getName() == 'rule_none': + accepted = [] + elif rule.getName() == 'rule_one': + accepted = [rule.value] + elif rule.getName() == 'rule_few': + accepted = list(rule.value) + else: # prevented by grammar + raise errors.UnreachableError('Invalid rule definition') + + # build criterion + if rule.criterion in self._VALUELESS: + cls = self._VALUELESS[rule.criterion] + if rule.op == '!': + tokens.append(matcher.NOT(cls())) + else: + tokens.append(cls()) + elif rule.criterion in self._CRITERIA: + cls = self._CRITERIA[rule.criterion] + if rule.op == '!=': + tokens.append(matcher.NOT(cls(accepted))) + else: + tokens.append(cls(accepted)) + else: # prevented by grammar + raise errors.UnreachableError(f'Invalid condition "{rule.criterion}"') + + # And-aggregate rules in one ruleset (if needed) + tokens = matcher.And(tokens) if len(tokens) > 1 else tokens[0] + rules.append(tokens) + + # Or-aggregate rulesets + expr = matcher.Or(rules) if len(rules) > 1 else rules[0] + + return expr + +# build default instance +file_match_parser = FileMatcherParser() + +def parse(query: str) -> matcher.Matcher: + """Shortcut for FileMatcherParser()(query).""" + return file_match_parser.parse(query) + +## EOF ## -- cgit v1.2.3 From 266c2c9a072bf3289fd7f2d75278b7d59528378c Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Sat, 24 Dec 2022 10:27:09 +0100 Subject: package restructuring: base * Reader and Extractor to respective reader/extractor modules * ReaderBuilder to reader module * ExtractorBuilder to extractor module * Loading module in utils (safe_load, unpack_name) * Pipeline and PipelineBuilder to lib module * errors to utils * documentation: "standard import" and "external import" --- bsie/utils/__init__.py | 9 ++++--- bsie/utils/errors.py | 45 +++++++++++++++++++++++++++++++++ bsie/utils/filematcher/parser.py | 6 ++--- bsie/utils/loading.py | 54 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 107 insertions(+), 7 deletions(-) create mode 100644 bsie/utils/errors.py create mode 100644 bsie/utils/loading.py (limited to 'bsie/utils') diff --git a/bsie/utils/__init__.py b/bsie/utils/__init__.py index 3981dc7..9cb60ed 100644 --- a/bsie/utils/__init__.py +++ b/bsie/utils/__init__.py @@ -4,21 +4,24 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # inner-module imports from . import bsfs +from . import filematcher from . import namespaces as ns from . import node -from . import filematcher +from .loading import safe_load, unpack_qualified_name # exports __all__: typing.Sequence[str] = ( - 'filematcher', 'bsfs', + 'filematcher', 'node', 'ns', + 'safe_load', + 'unpack_qualified_name', ) ## EOF ## diff --git a/bsie/utils/errors.py b/bsie/utils/errors.py new file mode 100644 index 0000000..5fafd5b --- /dev/null +++ b/bsie/utils/errors.py @@ -0,0 +1,45 @@ +"""Common BSIE exceptions. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# exports +__all__: typing.Sequence[str] = ( + 'BuilderError', + 'ExtractorError', + 'LoaderError', + 'ReaderError', + ) + + +## code ## + +class _BSIEError(Exception): + """Generic BSIE error.""" + +class BuilderError(_BSIEError): + """The Builder failed to create an instance.""" + +class LoaderError(BuilderError): + """Failed to load a module or class.""" + +class ExtractorError(_BSIEError): + """The Extractor failed to process the given content.""" + +class ReaderError(_BSIEError): + """The Reader failed to read the given file.""" + +class ProgrammingError(_BSIEError): + """An assertion-like error that indicates a code-base issue.""" + +class UnreachableError(ProgrammingError): + """Bravo, you've reached a point in code that should logically not be reachable.""" + +class ParserError(_BSIEError): + """Failed to parse due to invalid syntax or structures.""" + +## EOF ## diff --git a/bsie/utils/filematcher/parser.py b/bsie/utils/filematcher/parser.py index 0654742..2f82875 100644 --- a/bsie/utils/filematcher/parser.py +++ b/bsie/utils/filematcher/parser.py @@ -7,16 +7,14 @@ Author: Matthias Baumgartner, 2021 # standard imports import typing -# non-standard imports +# external imports import pyparsing from pyparsing import printables, alphas8bit, punc8bit, QuotedString, Word, \ delimitedList, Or, CaselessKeyword, Group, oneOf, Optional -# bsie imports -from bsie.base import errors - # inner-module imports from . import matcher +from .. import errors # exports __all__: typing.Sequence[str] = ( diff --git a/bsie/utils/loading.py b/bsie/utils/loading.py new file mode 100644 index 0000000..eb05c35 --- /dev/null +++ b/bsie/utils/loading.py @@ -0,0 +1,54 @@ +""" + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# standard imports +import importlib +import typing + +# inner-module imports +from . import errors + +# exports +__all__: typing.Sequence[str] = ( + 'safe_load', + 'unpack_qualified_name', + ) + + +## code ## + +def safe_load(module_name: str, class_name: str): + """Get a class from a module. Raise BuilderError if anything goes wrong.""" + try: + # load the module + module = importlib.import_module(module_name) + except Exception as err: + # cannot import module + raise errors.LoaderError(f'cannot load module {module_name}') from err + + try: + # get the class from the module + cls = getattr(module, class_name) + except Exception as err: + # cannot find the class + raise errors.LoaderError(f'cannot load class {class_name} from module {module_name}') from err + + return cls + + +def unpack_qualified_name(name): + """Split a name into its module and class component (dot-separated).""" + if not isinstance(name, str): + raise TypeError(name) + if '.' not in name: + raise ValueError('name must be a qualified class name.') + module_name, class_name = name[:name.rfind('.')], name[name.rfind('.')+1:] + if module_name == '': + raise ValueError('name must be a qualified class name.') + return module_name, class_name + + +## EOF ## -- cgit v1.2.3 From 07219685d01f803dc46c8d5465fa542c1d822cb4 Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Sat, 24 Dec 2022 10:39:51 +0100 Subject: documentation: standard vs external import --- bsie/utils/bsfs.py | 2 +- bsie/utils/errors.py | 2 +- bsie/utils/filematcher/__init__.py | 2 +- bsie/utils/filematcher/matcher.py | 4 +++- bsie/utils/namespaces.py | 2 +- bsie/utils/node.py | 2 +- 6 files changed, 8 insertions(+), 6 deletions(-) (limited to 'bsie/utils') diff --git a/bsie/utils/bsfs.py b/bsie/utils/bsfs.py index 0b88479..ef5db31 100644 --- a/bsie/utils/bsfs.py +++ b/bsie/utils/bsfs.py @@ -4,7 +4,7 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # bsfs imports diff --git a/bsie/utils/errors.py b/bsie/utils/errors.py index 5fafd5b..fbc16f7 100644 --- a/bsie/utils/errors.py +++ b/bsie/utils/errors.py @@ -4,7 +4,7 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # exports diff --git a/bsie/utils/filematcher/__init__.py b/bsie/utils/filematcher/__init__.py index b1c1b45..1e23e4e 100644 --- a/bsie/utils/filematcher/__init__.py +++ b/bsie/utils/filematcher/__init__.py @@ -4,7 +4,7 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # inner-module imports diff --git a/bsie/utils/filematcher/matcher.py b/bsie/utils/filematcher/matcher.py index 164beeb..a279a4b 100644 --- a/bsie/utils/filematcher/matcher.py +++ b/bsie/utils/filematcher/matcher.py @@ -4,11 +4,13 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2021 """ -# imports +# standard imports from collections.abc import Callable, Collection, Hashable import abc import os import typing + +# external imports import magic # exports diff --git a/bsie/utils/namespaces.py b/bsie/utils/namespaces.py index a29fc1b..2d0b535 100644 --- a/bsie/utils/namespaces.py +++ b/bsie/utils/namespaces.py @@ -4,7 +4,7 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # inner-module imports diff --git a/bsie/utils/node.py b/bsie/utils/node.py index ecf39cd..91e4f37 100644 --- a/bsie/utils/node.py +++ b/bsie/utils/node.py @@ -4,7 +4,7 @@ Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ -# imports +# standard imports import typing # bsie imports -- cgit v1.2.3 From 5d7fa2716009bc32c08f27e686cd92ca4c02b670 Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Mon, 16 Jan 2023 14:38:01 +0100 Subject: colors spatial feature --- bsie/utils/namespaces.py | 1 + 1 file changed, 1 insertion(+) (limited to 'bsie/utils') diff --git a/bsie/utils/namespaces.py b/bsie/utils/namespaces.py index 2d0b535..393b436 100644 --- a/bsie/utils/namespaces.py +++ b/bsie/utils/namespaces.py @@ -15,6 +15,7 @@ bse = _bsfs.Namespace('http://bsfs.ai/schema/Entity') bsfs = _bsfs.Namespace('http://bsfs.ai/schema', fsep='/') bsm = _bsfs.Namespace('http://bsfs.ai/schema/Meta') xsd = _bsfs.Namespace('http://www.w3.org/2001/XMLSchema') +bsf = _bsfs.Namespace('http://ie.bsfs.ai/schema/Feature') # export __all__: typing.Sequence[str] = ( -- cgit v1.2.3 From afd165000c1661a9cca117a4844ad3f89d926fdb Mon Sep 17 00:00:00 2001 From: Matthias Baumgartner Date: Mon, 16 Jan 2023 20:53:39 +0100 Subject: unsupported file format exception --- bsie/utils/errors.py | 3 +++ 1 file changed, 3 insertions(+) (limited to 'bsie/utils') diff --git a/bsie/utils/errors.py b/bsie/utils/errors.py index fbc16f7..8133cd4 100644 --- a/bsie/utils/errors.py +++ b/bsie/utils/errors.py @@ -42,4 +42,7 @@ class UnreachableError(ProgrammingError): class ParserError(_BSIEError): """Failed to parse due to invalid syntax or structures.""" +class UnsupportedFileFormatError(ReaderError): + """Failed to read a file format.""" + ## EOF ## -- cgit v1.2.3