aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.coveragerc15
-rw-r--r--.mypy.ini3
-rw-r--r--.pylintrc193
-rw-r--r--README51
-rwxr-xr-xbsie.app49
-rw-r--r--bsie.toml11
-rw-r--r--bsie/__init__.py18
-rw-r--r--bsie/apps/__init__.py20
-rw-r--r--bsie/apps/index.py121
-rw-r--r--bsie/apps/info.py74
-rw-r--r--bsie/base/__init__.py24
-rw-r--r--bsie/base/errors.py42
-rw-r--r--bsie/base/extractor.py103
-rw-r--r--bsie/base/reader.py47
-rw-r--r--bsie/extractor/__init__.py15
-rw-r--r--bsie/extractor/generic/__init__.py16
-rw-r--r--bsie/extractor/generic/constant.py57
-rw-r--r--bsie/extractor/generic/path.py74
-rw-r--r--bsie/extractor/generic/stat.py70
-rw-r--r--bsie/lib/__init__.py18
-rw-r--r--bsie/lib/bsie.py92
-rw-r--r--bsie/reader/__init__.py19
-rw-r--r--bsie/reader/path.py28
-rw-r--r--bsie/reader/stat.py32
-rw-r--r--bsie/tools/__init__.py20
-rw-r--r--bsie/tools/builder.py226
-rw-r--r--bsie/tools/pipeline.py144
-rw-r--r--bsie/utils/__init__.py22
-rw-r--r--bsie/utils/bsfs.py27
-rw-r--r--bsie/utils/namespaces.py27
-rw-r--r--bsie/utils/node.py53
-rw-r--r--setup.py20
-rw-r--r--test/__init__.py0
-rw-r--r--test/apps/__init__.py0
-rw-r--r--test/apps/test_index.py159
-rw-r--r--test/apps/test_info.py42
-rw-r--r--test/apps/testdir/alpha/alpha_first16
-rw-r--r--test/apps/testdir/alpha/alpha_second12
-rw-r--r--test/apps/testdir/alpha/omega/omega_first14
-rw-r--r--test/apps/testdir/alpha/omega/omega_second10
-rw-r--r--test/apps/testdir/foo/bar/bar_first20
-rw-r--r--test/apps/testdir/foo/bar/bar_second14
-rw-r--r--test/apps/testdir/foo/foo_first11
-rw-r--r--test/apps/testdir/foo/foo_second12
-rw-r--r--test/apps/testdir/td_first18
-rw-r--r--test/apps/testdir/td_second14
-rw-r--r--test/apps/testfile16
-rw-r--r--test/base/__init__.py0
-rw-r--r--test/base/test_extractor.py70
-rw-r--r--test/base/test_reader.py45
-rw-r--r--test/extractor/__init__.py0
-rw-r--r--test/extractor/generic/__init__.py0
-rw-r--r--test/extractor/generic/test_constant.py124
-rw-r--r--test/extractor/generic/test_path.py74
-rw-r--r--test/extractor/generic/test_stat.py73
-rw-r--r--test/lib/__init__.py0
-rw-r--r--test/lib/test_bsie.py179
-rw-r--r--test/lib/testfile.t1
-rw-r--r--test/reader/__init__.py0
-rw-r--r--test/reader/test_path.py28
-rw-r--r--test/reader/test_stat.py34
-rw-r--r--test/tools/__init__.py0
-rw-r--r--test/tools/test_builder.py246
-rw-r--r--test/tools/test_pipeline.py176
-rw-r--r--test/tools/testfile.t1
-rw-r--r--test/utils/__init__.py0
-rw-r--r--test/utils/test_node.py65
67 files changed, 3205 insertions, 0 deletions
diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000..40f07cc
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,15 @@
+[run]
+dynamic_context = test_function
+branch = True
+source = bsie
+data_file = .coverage
+command_line = -m unittest
+
+[report]
+show_missing = True
+skip_empty = True
+
+[html]
+directory = .htmlcov
+show_contexts = True
+
diff --git a/.mypy.ini b/.mypy.ini
new file mode 100644
index 0000000..4d0a25d
--- /dev/null
+++ b/.mypy.ini
@@ -0,0 +1,3 @@
+[mypy]
+ignore_missing_imports = True
+packages=bsie
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 0000000..1b34854
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,193 @@
+[MAIN]
+
+# Pickle collected data for later comparisons.
+persistent=no
+
+# Minimum Python version to use for version dependent checks. Will default to
+# the version used to run pylint.
+py-version=3.8
+
+# Discover python modules and packages in the file system subtree.
+recursive=yes
+
+# When enabled, pylint would attempt to guess common misconfiguration and emit
+# user-friendly hints instead of false-positive error messages.
+suggestion-mode=yes
+
+
+[BASIC]
+
+# Minimum line length for functions/classes that require docstrings, shorter
+# ones are exempt.
+docstring-min-length=-1
+
+# Bad variable names which should always be refused, separated by a comma.
+bad-names=foo,bar,abc,cba,xyz,zyx,foobar,hello,world
+
+# Good variable names which should always be accepted, separated by a comma.
+good-names=i,j,k,n,_
+
+# Naming style matching correct argument names.
+argument-naming-style=snake_case
+
+# Naming style matching correct attribute names.
+attr-naming-style=snake_case
+
+# Naming style matching correct class attribute names.
+class-attribute-naming-style=any
+
+# Naming style matching correct class constant names.
+class-const-naming-style=UPPER_CASE
+
+# Naming style matching correct class names.
+class-naming-style=PascalCase
+
+# Naming style matching correct constant names.
+const-naming-style=UPPER_CASE
+
+# Naming style matching correct function names.
+function-naming-style=snake_case
+
+# Include a hint for the correct naming format with invalid-name.
+include-naming-hint=yes
+
+# Naming style matching correct inline iteration names.
+inlinevar-naming-style=any
+
+# Naming style matching correct method names.
+method-naming-style=snake_case
+
+# Naming style matching correct module names.
+module-naming-style=snake_case
+
+# Naming style matching correct variable names.
+variable-naming-style=snake_case
+
+
+[DESIGN]
+
+# Maximum number of arguments for function / method.
+max-args=5
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes=7
+
+# Maximum number of boolean expressions in an if statement (see R0916).
+max-bool-expr=5
+
+# Maximum number of branch for function / method body.
+max-branches=12
+
+# Maximum number of locals for function / method body.
+max-locals=15
+
+# Maximum number of parents for a class (see R0901).
+max-parents=7
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=20
+
+# Maximum number of return / yield for function / method body.
+max-returns=6
+
+# Maximum number of statements in function / method body.
+max-statements=50
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=1
+
+
+[FORMAT]
+
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+
+# Number of spaces of indent required inside a hanging or continued line.
+indent-after-paren=4
+
+# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
+# tab).
+indent-string=' '
+
+# Maximum number of characters on a single line.
+max-line-length=120
+
+# Maximum number of lines in a module.
+max-module-lines=1000
+
+# Allow the body of a class to be on the same line as the declaration if body
+# contains single statement.
+single-line-class-stmt=no
+
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=no
+
+
+[IMPORTS]
+
+# List of modules that can be imported at any level, not just the top level
+# one.
+allow-any-import-level=
+
+# Allow wildcard imports from modules that define __all__.
+allow-wildcard-with-all=no
+
+
+[LOGGING]
+
+# The type of string formatting that logging methods do. `old` means using %
+# formatting, `new` is for `{}` formatting.
+logging-format-style=old
+
+
+
+[MISCELLANEOUS]
+
+# List of note tags to take in consideration, separated by a comma.
+notes=FIXME,TODO
+
+
+
+[REPORTS]
+
+# Tells whether to display a full report or only the messages.
+reports=yes
+
+# Activate the evaluation score.
+score=yes
+
+
+[SIMILARITIES]
+
+# Minimum lines number of a similarity.
+min-similarity-lines=4
+
+
+[STRING]
+
+# This flag controls whether inconsistent-quotes generates a warning when the
+# character used as a quote delimiter is used inconsistently within a module.
+check-quote-consistency=yes
+
+
+[TYPECHECK]
+
+# Tells whether to warn about missing members when the owner of the attribute
+# is inferred to be None.
+ignore-none=no
+
+
+[VARIABLES]
+
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=clbk,callback
+
+
+
+
+# Disable: R1735 (use-dict-literal)
diff --git a/README b/README
index b790244..3326196 100644
--- a/README
+++ b/README
@@ -3,3 +3,54 @@ Black Star Information Extraction
=================================
+### Developer tools setup
+
+#### Test coverage (coverage)
+
+Resources:
+* https://coverage.readthedocs.io/en/6.5.0/index.html
+* https://nedbatchelder.com/blog/200710/flaws_in_coverage_measurement.html
+
+Commands:
+$ pip install coverage
+$ coverage run ; coverage html ; xdg-open .htmlcov/index.html
+
+
+
+#### Static code analysis (pylint)
+
+Resources:
+* https://github.com/PyCQA/pylint
+* https://pylint.org/
+* https://pylint.pycqa.org/en/latest/user_guide/messages/messages_overview.html#messages-overview
+
+Commands:
+$ pip install pylint
+$ pylint bsie
+
+
+
+#### Type analysis (mypy)
+
+Resources:
+* https://github.com/python/mypy
+* https://mypy.readthedocs.io/en/stable/
+
+Commands:
+$ pip install mypy
+$ mypy
+
+
+
+#### Documentation (sphinx)
+
+Resources:
+*
+*
+
+Commands:
+$ pip install ...
+$
+
+
+
diff --git a/bsie.app b/bsie.app
new file mode 100755
index 0000000..ba9cee7
--- /dev/null
+++ b/bsie.app
@@ -0,0 +1,49 @@
+"""BSIE tools.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import argparse
+import typing
+
+# module imports
+import bsie
+import bsie.apps
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'main',
+ )
+
+# config
+apps = {
+ 'index' : bsie.apps.index,
+ 'info' : bsie.apps.info,
+ }
+
+
+## code ##
+
+def main(argv):
+ """Black Star File System maintenance tools."""
+ parser = argparse.ArgumentParser(description=main.__doc__, prog='bsie')
+ parser.add_argument('--version', action='version',
+ version='%(prog)s version {}.{}.{}'.format(*bsie.version_info))
+ parser.add_argument('app', choices=apps.keys(),
+ help='Select the application to run.')
+ parser.add_argument('rest', nargs=argparse.REMAINDER)
+ # parse
+ args = parser.parse_args()
+ # run application
+ apps[args.app](args.rest)
+
+
+## main ##
+
+if __name__ == '__main__':
+ import sys
+ main(sys.argv[1:])
+
+## EOF ##
diff --git a/bsie.toml b/bsie.toml
new file mode 100644
index 0000000..10b0f37
--- /dev/null
+++ b/bsie.toml
@@ -0,0 +1,11 @@
+[project]
+name = "bsie"
+description = "Extract information from files and store them in a BSFS."
+version = "0.0.1"
+license = {text = "BSD 3-Clause License"}
+authors = [{name='Matthias Baumgartner', email="dev@igsor.net"}]
+dependencies = [
+ "rdflib",
+ "bsfs",
+]
+requires-python = ">=3.7"
diff --git a/bsie/__init__.py b/bsie/__init__.py
new file mode 100644
index 0000000..8d2308c
--- /dev/null
+++ b/bsie/__init__.py
@@ -0,0 +1,18 @@
+"""The BSIE module extracts triples from files for insertion into a BSFS storage.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import collections
+import typing
+
+# constants
+T_VERSION_INFO = collections.namedtuple('T_VERSION_INFO', ('major', 'minor', 'micro')) # pylint: disable=invalid-name
+version_info = T_VERSION_INFO(0, 0, 1)
+
+# exports
+__all__: typing.Sequence[str] = []
+
+## EOF ##
diff --git a/bsie/apps/__init__.py b/bsie/apps/__init__.py
new file mode 100644
index 0000000..a548c3c
--- /dev/null
+++ b/bsie/apps/__init__.py
@@ -0,0 +1,20 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from .index import main as index
+from .info import main as info
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'index',
+ 'info',
+ )
+
+## EOF ##
diff --git a/bsie/apps/index.py b/bsie/apps/index.py
new file mode 100644
index 0000000..1dbfdd8
--- /dev/null
+++ b/bsie/apps/index.py
@@ -0,0 +1,121 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import argparse
+import os
+import typing
+
+# bsie imports
+from bsie.base import errors
+from bsie.lib import BSIE
+from bsie.tools import builder
+from bsie.utils import bsfs
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'main',
+ )
+
+
+## code ##
+
+def main(argv):
+ """Index files or directories into BSFS."""
+ # NOTE: The docstring above is also used as the argparse description (main.__doc__).
+ parser = argparse.ArgumentParser(description=main.__doc__, prog='index')
+ parser.add_argument('--user', type=bsfs.URI, default=bsfs.URI('http://example.com/me'),
+ help='')
+ # --collect and --discard may be given multiple times; both lists are passed to BSIE below.
+ parser.add_argument('--collect', action='append', default=[],
+ help='')
+ parser.add_argument('--discard', action='append', default=[],
+ help='')
+ parser.add_argument('-r', '--recursive', action='store_true', default=False,
+ help='')
+ # --follow is passed as followlinks to os.walk.
+ parser.add_argument('--follow', action='store_true', default=False,
+ help='')
+ parser.add_argument('--print', action='store_true', default=False,
+ help='')
+ parser.add_argument('input_file', nargs=argparse.REMAINDER,
+ help='')
+ args = parser.parse_args(argv)
+
+ # FIXME: Read reader/extractor configs from a config file
+ # reader builder
+ rbuild = builder.ReaderBuilder({})
+ # extractor builder
+ ebuild = builder.ExtractorBuilder([
+ {'bsie.extractor.generic.path.Path': {}},
+ {'bsie.extractor.generic.stat.Stat': {}},
+ {'bsie.extractor.generic.constant.Constant': dict(
+ tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')],
+ schema='''
+ bse:author rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+ ''',
+ )},
+ ])
+ # pipeline builder
+ # NOTE: The user URI is used as namespace; a trailing slash is appended if missing.
+ pbuild = builder.PipelineBuilder(
+ bsfs.Namespace(args.user + ('/' if not args.user.endswith('/') else '')),
+ rbuild,
+ ebuild,
+ )
+
+ # build pipeline
+ pipeline = pbuild.build()
+ # build BSIE frontend
+ bsie = BSIE(pipeline, args.collect, args.discard)
+
+
+ def walk(handle):
+ """Walk through given input files."""
+ # *handle* is called once per extracted (node, predicate, value) triple.
+ # FIXME: collect all triples by node, set all predicates at once
+ # FIXME: simplify code (below but maybe also above)
+ # FIXME: How to handle dependencies between data?
+ # E.g. do I still want to link to a tag despite not being permitted to set its label?
+ # FIXME: node renaming?
+
+ # index input paths
+ for path in args.input_file:
+ if os.path.isdir(path) and args.recursive:
+ for dirpath, _, filenames in os.walk(path, topdown=True, followlinks=args.follow):
+ for filename in filenames:
+ for node, pred, value in bsie.from_file(os.path.join(dirpath, filename)):
+ handle(node, pred, value)
+ elif os.path.isfile(path):
+ for node, pred, value in bsie.from_file(path):
+ handle(node, pred, value)
+ else:
+ # NOTE(review): This branch is reachable from user input, namely for a directory
+ # given without --recursive and for a path that is not an existing file.
+ # Confirm whether UnreachableError is the intended error for these cases.
+ raise errors.UnreachableError()
+
+
+ # dry run: print the triples instead of storing them, and return no store.
+ if args.print:
+ walk(print)
+ return None
+
+ # initialize bsfs
+ # NOTE: With persistent storages, the schema migration will be a separate operation.
+ # Here, we'd simply examine the schema and potentially discard more predicates.
+ store = bsfs.Open(bsfs.init_sparql_store(args.user))
+ store.migrate(bsie.schema)
+ # process files
+ # each triple is set on the store node identified by the node's type and uri.
+ def handle(node, pred, value):
+ store.node(node.node_type, node.uri).set(pred.uri, value)
+ walk(handle)
+ # return store
+ return store
+
+
+
+## main ##
+
+if __name__ == '__main__':
+ import sys
+ main(sys.argv[1:])
+
+## EOF ##
diff --git a/bsie/apps/info.py b/bsie/apps/info.py
new file mode 100644
index 0000000..eaf1f71
--- /dev/null
+++ b/bsie/apps/info.py
@@ -0,0 +1,74 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import argparse
+import sys
+import typing
+
+# bsie imports
+from bsie.base import errors
+from bsie.tools import builder
+from bsie.utils import bsfs
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'main',
+ )
+
+
+## code ##
+
+def main(argv):
+ """Show information from BSIE."""
+ # NOTE: The docstring above is also used as the argparse description (main.__doc__).
+ parser = argparse.ArgumentParser(description=main.__doc__, prog='info')
+ parser.add_argument('what', choices=('predicates', ),
+ help='Select what information to show.')
+ args = parser.parse_args(argv)
+
+ # FIXME: Read reader/extractor configs from a config file
+ # NOTE(review): The reader/extractor configuration below is hard-coded; the index
+ # application (bsie/apps/index.py) carries the same list. Keep them in sync until
+ # the config file exists.
+ # reader builder
+ rbuild = builder.ReaderBuilder({})
+ # extractor builder
+ ebuild = builder.ExtractorBuilder([
+ {'bsie.extractor.generic.path.Path': {}},
+ {'bsie.extractor.generic.stat.Stat': {}},
+ {'bsie.extractor.generic.constant.Constant': dict(
+ tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')],
+ schema='''
+ bse:author rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+ ''',
+ )},
+ ])
+ # pipeline builder
+ pbuild = builder.PipelineBuilder(
+ bsfs.Namespace('http://example.com/me/'), # not actually used
+ rbuild,
+ ebuild,
+ )
+
+ # build pipeline
+ pipeline = pbuild.build()
+
+ # show info
+ if args.what == 'predicates':
+ # show predicates
+ # prints the URI of every predicate in the pipeline's schema, one per line.
+ for pred in pipeline.schema.predicates():
+ print(pred.uri)
+ else:
+ # args.what is already checked by argparse
+ raise errors.UnreachableError()
+
+
+## main ##
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
+
+## EOF ##
diff --git a/bsie/base/__init__.py b/bsie/base/__init__.py
new file mode 100644
index 0000000..0d362cd
--- /dev/null
+++ b/bsie/base/__init__.py
@@ -0,0 +1,24 @@
+"""The base module defines the BSIE interfaces.
+
+You'll mostly find abstract classes here.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from . import errors
+from .extractor import Extractor
+from .reader import Reader
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Extractor',
+ 'Reader',
+ 'errors',
+ )
+
+## EOF ##
diff --git a/bsie/base/errors.py b/bsie/base/errors.py
new file mode 100644
index 0000000..dc3c30e
--- /dev/null
+++ b/bsie/base/errors.py
@@ -0,0 +1,42 @@
+"""Common BSIE exceptions.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# exports
+# NOTE(review): ProgrammingError and UnreachableError are defined below but are not
+# listed here, so a star-import does not provide them. Confirm whether that is intended.
+__all__: typing.Sequence[str] = (
+ 'BuilderError',
+ 'ExtractorError',
+ 'LoaderError',
+ 'ReaderError',
+ )
+
+
+## code ##
+
+# Exception hierarchy defined in this module:
+# _BSIEError
+# - BuilderError
+# - LoaderError
+# - ExtractorError
+# - ReaderError
+# - ProgrammingError
+# - UnreachableError
+
+class _BSIEError(Exception):
+ """Generic BSIE error."""
+
+class BuilderError(_BSIEError):
+ """The Builder failed to create an instance."""
+
+class LoaderError(BuilderError):
+ """Failed to load a module or class."""
+
+class ExtractorError(_BSIEError):
+ """The Extractor failed to process the given content."""
+
+class ReaderError(_BSIEError):
+ """The Reader failed to read the given file."""
+
+class ProgrammingError(_BSIEError):
+ """An assertion-like error that indicates a code-base issue."""
+
+class UnreachableError(ProgrammingError):
+ """Bravo, you've reached a point in code that should logically not be reachable."""
+
+## EOF ##
diff --git a/bsie/base/extractor.py b/bsie/base/extractor.py
new file mode 100644
index 0000000..c44021b
--- /dev/null
+++ b/bsie/base/extractor.py
@@ -0,0 +1,103 @@
+"""The Extractor classes transform content into triples.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import abc
+import typing
+
+# bsie imports
+from bsie.utils import bsfs, node, ns
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Extractor',
+ )
+
+# constants
+
+# essential definitions typically used in extractor schemas.
+# NOTE: This preamble is only for convenience; Each Extractor must implement its use, if so desired.
+SCHEMA_PREAMBLE = '''
+ # common external prefixes
+ prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+ prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+ prefix xsd: <http://www.w3.org/2001/XMLSchema#>
+ prefix schema: <http://schema.org/>
+
+ # common bsfs prefixes
+ prefix bsfs: <http://bsfs.ai/schema/>
+ prefix bse: <http://bsfs.ai/schema/Entity#>
+
+ # essential nodes
+ bsfs:Entity rdfs:subClassOf bsfs:Node .
+ bsfs:File rdfs:subClassOf bsfs:Entity .
+
+ # common definitions
+ xsd:string rdfs:subClassOf bsfs:Literal .
+ xsd:integer rdfs:subClassOf bsfs:Literal .
+
+ '''
+
+
+## code ##
+
+class Extractor(abc.ABC):
+ """Produce (subject, predicate, value)-triples from some content.
+ The Extractor produces principal predicates that provide information
+ about the content itself (i.e., triples that include the subject),
+ and may also generate triples with auxiliary predicates if the
+ extracted value is a node itself.
+ """
+
+ # what type of content is expected (i.e. reader subclass).
+ # Subclasses in this package set it to the dotted path of a reader class or leave it None.
+ CONTENT_READER: typing.Optional[str] = None
+
+ # extractor schema.
+ _schema: bsfs.schema.Schema
+
+ def __init__(self, schema: bsfs.schema.Schema):
+ """Store the *schema* that defines the predicates this extractor can produce."""
+ self._schema = schema
+
+ def __str__(self) -> str:
+ return bsfs.typename(self)
+
+ def __repr__(self) -> str:
+ return f'{bsfs.typename(self)}()'
+
+ # Equal if *other* is an instance of this extractor's type and has
+ # the same content reader and the same schema.
+ def __eq__(self, other: typing.Any) -> bool:
+ return isinstance(other, type(self)) \
+ and self.CONTENT_READER == other.CONTENT_READER \
+ and self.schema == other.schema
+
+ # The hash covers the same properties as the equality check above.
+ def __hash__(self) -> int:
+ return hash((type(self), self.CONTENT_READER, self.schema))
+
+ @property
+ def schema(self) -> bsfs.schema.Schema:
+ """Return the extractor's schema."""
+ return self._schema
+
+ @property
+ def principals(self) -> typing.Iterator[bsfs.schema.Predicate]:
+ """Return the principal predicates, i.e., relations from/to the extraction subject."""
+ ent = self.schema.node(ns.bsfs.Entity)
+ # keep predicates whose domain or range compares `<=` to bsfs:Entity
+ # (presumably the subclass relation of the schema -- TODO confirm).
+ return (
+ pred
+ for pred
+ in self.schema.predicates()
+ if pred.domain <= ent or (pred.range is not None and pred.range <= ent)
+ )
+
+ @abc.abstractmethod
+ def extract(
+ self,
+ subject: node.Node,
+ content: typing.Any,
+ principals: typing.Iterable[bsfs.schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+ """Return (node, predicate, value) triples."""
+
+## EOF ##
diff --git a/bsie/base/reader.py b/bsie/base/reader.py
new file mode 100644
index 0000000..cbabd36
--- /dev/null
+++ b/bsie/base/reader.py
@@ -0,0 +1,47 @@
+"""The Reader classes return high-level content structures from files.
+
+The Reader fulfills two purposes:
+ First, it brokers between multiple libraries and file formats.
+ Second, it separates multiple aspects of a file into distinct content types.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import abc
+import typing
+
+# bsie imports
+from bsie.utils import bsfs
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Reader',
+ )
+
+
+## code ##
+
+class Reader(abc.ABC):
+ """Read and return some content from a file."""
+
+ def __str__(self) -> str:
+ return bsfs.typename(self)
+
+ def __repr__(self) -> str:
+ return f'{bsfs.typename(self)}()'
+
+ # The base class holds no state: any instance of this reader's type compares equal.
+ def __eq__(self, other: typing.Any) -> bool:
+ return isinstance(other, type(self))
+
+ # Consistent with the equality check: the hash depends on the type only.
+ def __hash__(self) -> int:
+ return hash(type(self))
+
+ @abc.abstractmethod
+ def __call__(self, path: bsfs.URI) -> typing.Any:
+ """Return some content of the file at *path*.
+ Raises a `ReaderError` if the reader cannot make sense of the file format.
+ """
+
+## EOF ##
diff --git a/bsie/extractor/__init__.py b/bsie/extractor/__init__.py
new file mode 100644
index 0000000..ef31343
--- /dev/null
+++ b/bsie/extractor/__init__.py
@@ -0,0 +1,15 @@
+"""Extractors produce triples from some content.
+
+Each Extractor class is linked to the Reader class whose content it requires.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# exports
+__all__: typing.Sequence[str] = []
+
+## EOF ##
diff --git a/bsie/extractor/generic/__init__.py b/bsie/extractor/generic/__init__.py
new file mode 100644
index 0000000..0cb7e7f
--- /dev/null
+++ b/bsie/extractor/generic/__init__.py
@@ -0,0 +1,16 @@
+"""Generic extractors focus on information that is typically available on all
+files. Examples include file system information (file name and size, mime type,
+etc.) and information that is independent of the actual file (constant triples,
+host platform infos, current time, etc.).
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# exports
+__all__: typing.Sequence[str] = []
+
+## EOF ##
diff --git a/bsie/extractor/generic/constant.py b/bsie/extractor/generic/constant.py
new file mode 100644
index 0000000..11384e6
--- /dev/null
+++ b/bsie/extractor/generic/constant.py
@@ -0,0 +1,57 @@
+"""The Constant extractor produces pre-specified triples.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsie imports
+from bsie.base import extractor
+from bsie.utils import bsfs, node
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Constant',
+ )
+
+
+## code ##
+
+class Constant(extractor.Extractor):
+ """Extract information from file's path."""
+
+ CONTENT_READER = None
+
+ # predicate/value pairs to be produced.
+ _tuples: typing.Tuple[typing.Tuple[bsfs.schema.Predicate, typing.Any], ...]
+
+ def __init__(
+ self,
+ schema: str,
+ tuples: typing.Iterable[typing.Tuple[bsfs.URI, typing.Any]],
+ ):
+ super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + schema))
+ # NOTE: Raises a KeyError if the predicate is not part of the schema
+ self._tuples = tuple((self.schema.predicate(p_uri), value) for p_uri, value in tuples)
+ # TODO: use schema instance for value checking
+
+ def __eq__(self, other: typing.Any) -> bool:
+ return super().__eq__(other) \
+ and self._tuples == other._tuples
+
+ def __hash__(self) -> int:
+ return hash((super().__hash__(), self._tuples))
+
+ def extract(
+ self,
+ subject: node.Node,
+ content: None,
+ principals: typing.Iterable[bsfs.schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+ for pred, value in self._tuples:
+ if pred in principals:
+ yield subject, pred, value
+
+## EOF ##
diff --git a/bsie/extractor/generic/path.py b/bsie/extractor/generic/path.py
new file mode 100644
index 0000000..7018e12
--- /dev/null
+++ b/bsie/extractor/generic/path.py
@@ -0,0 +1,74 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import os
+import typing
+
+# bsie imports
+from bsie.base import extractor
+from bsie.utils import bsfs, node, ns
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Path',
+ )
+
+
+## code ##
+
+class Path(extractor.Extractor):
+ """Extract information from file's path."""
+
+ # dotted path of the reader class whose output this extractor expects as content.
+ CONTENT_READER = 'bsie.reader.path.Path'
+
+ # mapping from predicate to handler function.
+ _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[str], typing.Any]]
+
+ def __init__(self):
+ """Set up the schema (the bse:filename predicate) and the predicate handlers."""
+ super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ bse:filename rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:File ;
+ rdfs:range xsd:string ;
+ rdfs:label "File name"^^xsd:string ;
+ schema:description "Filename of entity in some filesystem."^^xsd:string ;
+ bsfs:unique "false"^^xsd:boolean .
+ '''))
+ self._callmap = {
+ self.schema.predicate(ns.bse.filename): self.__filename,
+ }
+
+ def extract(
+ self,
+ subject: node.Node,
+ content: str,
+ principals: typing.Iterable[bsfs.schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+ """Yield (subject, predicate, value) for each predicate in *principals* that
+ has a handler and for which the handler returns a value other than None.
+ """
+ for pred in principals:
+ # find callback
+ clbk = self._callmap.get(pred)
+ if clbk is None:
+ continue
+ # get value
+ value = clbk(content)
+ if value is None:
+ continue
+ # produce triple
+ yield subject, pred, value
+
+ def __filename(self, path: str) -> typing.Optional[str]:
+ """Return the basename of *path*, or None if it cannot be determined."""
+ try:
+ return os.path.basename(path)
+ except Exception: # pylint: disable=broad-except # we explicitly want to catch everything
+ # some error, skip
+ # FIXME: some kind of error reporting (e.g. logging)?
+ # Options: (a) Fail silently (current); (b) Skip and report to log;
+ # (c) Raise ExtractorError (aborts extraction); (d) separate content type
+ # checks from basename errors (report content type errors, skip basename
+ # errors)
+ return None
+
+## EOF ##
diff --git a/bsie/extractor/generic/stat.py b/bsie/extractor/generic/stat.py
new file mode 100644
index 0000000..0b9ce29
--- /dev/null
+++ b/bsie/extractor/generic/stat.py
@@ -0,0 +1,70 @@
+"""Extract information from the file system, such as filesize.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import os
+import typing
+
+# bsie imports
+from bsie.base import extractor
+from bsie.utils import bsfs, node, ns
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Stat',
+ )
+
+
+## code ##
+
+class Stat(extractor.Extractor):
+ """Extract information from the file system."""
+
+ # dotted path of the reader class whose output this extractor expects as content.
+ CONTENT_READER = 'bsie.reader.stat.Stat'
+
+ # mapping from predicate to handler function.
+ _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[os.stat_result], typing.Any]]
+
+ def __init__(self):
+ """Set up the schema (the bse:filesize predicate) and the predicate handlers."""
+ super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ bse:filesize rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:File ;
+ rdfs:range xsd:integer ;
+ rdfs:label "File size"^^xsd:string ;
+ schema:description "File size of entity in some filesystem."^^xsd:string ;
+ bsfs:unique "false"^^xsd:boolean .
+ '''))
+ self._callmap = {
+ self.schema.predicate(ns.bse.filesize): self.__filesize,
+ }
+
+ def extract(
+ self,
+ subject: node.Node,
+ content: os.stat_result,
+ principals: typing.Iterable[bsfs.schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+ """Yield (subject, predicate, value) for each predicate in *principals* that
+ has a handler and for which the handler returns a value other than None.
+ """
+ for pred in principals:
+ # find callback
+ clbk = self._callmap.get(pred)
+ if clbk is None:
+ continue
+ # get value
+ value = clbk(content)
+ if value is None:
+ continue
+ # produce triple
+ yield subject, pred, value
+
+ def __filesize(self, content: os.stat_result) -> typing.Optional[int]:
+ """Return the file size."""
+ # reads st_size from the stat result; any failure yields None (no triple is produced).
+ try:
+ return content.st_size
+ except Exception: # pylint: disable=broad-except # we explicitly want to catch everything
+ # FIXME: some kind of error reporting (e.g. logging)
+ return None
+
+## EOF ##
diff --git a/bsie/lib/__init__.py b/bsie/lib/__init__.py
new file mode 100644
index 0000000..578c2c4
--- /dev/null
+++ b/bsie/lib/__init__.py
@@ -0,0 +1,18 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from .bsie import BSIE
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'BSIE',
+ )
+
+## EOF ##
diff --git a/bsie/lib/bsie.py b/bsie/lib/bsie.py
new file mode 100644
index 0000000..e087fa9
--- /dev/null
+++ b/bsie/lib/bsie.py
@@ -0,0 +1,92 @@
+"""
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsie imports
+from bsie.tools import Pipeline
+from bsie.utils import bsfs, node, ns
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'BSIE',
+ )
+
+
+## code ##
+
+class BSIE():
+ """Extract triples from files.
+
+ Controls which predicates to extract (*collect*) and
+ which to not extract (*discard*). Note that this only affects
+ principal predicates, not auxiliary predicates like, e.g., tag labels.
+
+ """
+
+ # pipeline
+ _pipeline: Pipeline
+
+ # predicates to extract.
+ _principals: typing.Set[bsfs.URI]
+
+ # local schema.
+ _schema: bsfs.schema.Schema
+
+ def __init__(
+ self,
+ # extraction pipeline.
+ pipeline: Pipeline,
+ # principals to extract at most. None implies all available w.r.t. extractors.
+ # NOTE: An empty iterable is treated like None (see the length check below).
+ collect: typing.Optional[typing.Iterable[bsfs.URI]] = None,
+ # principals to discard.
+ discard: typing.Optional[typing.Iterable[bsfs.URI]] = None,
+ ):
+ # store pipeline
+ self._pipeline = pipeline
+ # start off with available principals
+ self._principals = {pred.uri for pred in self._pipeline.principals}
+ # limit principals to specified ones by argument.
+ if collect is not None:
+ collect = set(collect)
+ if len(collect) > 0:
+ self._principals &= collect
+ # discard principals.
+ if discard is not None:
+ self._principals -= set(discard)
+ # discard ns.bsfs.Predicate
+ self._principals.discard(ns.bsfs.Predicate)
+ # compile a schema that only contains the requested principals (and auxiliary predicates)
+ self._schema = self._pipeline.subschema(
+ self._pipeline.schema.predicate(pred) for pred in self._principals)
+
+ @property
+ def schema(self) -> bsfs.schema.Schema:
+ """Return the BSIE schema."""
+ return self._schema
+
+ @property
+ def principals(self) -> typing.Iterator[bsfs.URI]:
+ """Return an iterator to the principal predicates."""
+ return iter(self._principals)
+
+ # NOTE(review): The return annotation names bsfs.URI as second element, but the index
+ # application reads `.uri` from it, which suggests predicate instances -- TODO confirm.
+ def from_file(
+ self,
+ path: bsfs.URI,
+ principals: typing.Optional[typing.Iterable[bsfs.URI]] = None,
+ ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.URI, typing.Any]]:
+ """Produce triples for a given *path*. Limit to *principals* if given."""
+ # get requested principals.
+ # NOTE: Without explicit *principals* this is the very same set object as
+ # self._principals; the in-place intersection below then leaves it unchanged.
+ principals = set(principals) if principals is not None else self._principals
+ # filter through requested principals.
+ principals &= self._principals
+ # predicate lookup
+ principals = {self.schema.predicate(pred) for pred in principals}
+ # invoke pipeline
+ yield from self._pipeline(path, principals)
+
+## EOF ##
diff --git a/bsie/reader/__init__.py b/bsie/reader/__init__.py
new file mode 100644
index 0000000..a45f22b
--- /dev/null
+++ b/bsie/reader/__init__.py
@@ -0,0 +1,19 @@
+"""The Reader classes return high-level content structures from files.
+
+The Reader fulfills two purposes:
+ First, it brokers between multiple libraries and file formats.
+ Second, it separates multiple aspects of a file into distinct content types.
+
+Often, different libraries focus on reading different types of content from a
+file. E.g. one would use different modules to read file system infos than to
+read exif or pixel data of an image. Hence, this module is organized by content
+type. Each distinct type can be implemented in a file or submodule that
+provides a Reader implementation. Through utilization of submodules, different
+file formats can be supported.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+
+## EOF ##
diff --git a/bsie/reader/path.py b/bsie/reader/path.py
new file mode 100644
index 0000000..d60f187
--- /dev/null
+++ b/bsie/reader/path.py
@@ -0,0 +1,28 @@
+"""The Path reader produces a file path.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsie imports
+from bsie.base import reader
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Path',
+ )
+
+
+## code ##
+
+class Path(reader.Reader):
+    """Reader whose content is the file path itself."""
+
+    def __call__(self, path: str) -> str:
+        # the path is the content; it is handed back unchanged
+        return path
+
+
+## EOF ##
diff --git a/bsie/reader/stat.py b/bsie/reader/stat.py
new file mode 100644
index 0000000..fc5fb24
--- /dev/null
+++ b/bsie/reader/stat.py
@@ -0,0 +1,32 @@
+"""The Stat reader produces filesystem stat information.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import os
+import typing
+
+# bsie imports
+from bsie.base import errors, reader
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Stat',
+ )
+
+
+## code ##
+
+class Stat(reader.Reader):
+    """Reader that provides the `os.stat` result of a file."""
+
+    def __call__(self, path: str) -> os.stat_result:
+        try:
+            info = os.stat(path)
+        except Exception as err:
+            # report any failure as a ReaderError and keep the cause attached
+            raise errors.ReaderError(path) from err
+        return info
+
+
+## EOF ##
diff --git a/bsie/tools/__init__.py b/bsie/tools/__init__.py
new file mode 100644
index 0000000..803c321
--- /dev/null
+++ b/bsie/tools/__init__.py
@@ -0,0 +1,20 @@
+"""Tools to build and run extraction pipelines.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from . import builder
+from .pipeline import Pipeline
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'builder',
+ 'Pipeline',
+ )
+
+## EOF ##
diff --git a/bsie/tools/builder.py b/bsie/tools/builder.py
new file mode 100644
index 0000000..190d9bf
--- /dev/null
+++ b/bsie/tools/builder.py
@@ -0,0 +1,226 @@
+"""Builders that create readers, extractors, and pipelines from specifications.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import importlib
+import logging
+import typing
+
+# bsie imports
+from bsie import base
+from bsie.base import errors
+from bsie.utils import bsfs
+
+# inner-module imports
+from . import pipeline
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'ExtractorBuilder',
+ 'PipelineBuilder',
+ 'ReaderBuilder',
+ )
+
+
+## code ##
+
+logger = logging.getLogger(__name__)
+
+def _safe_load(module_name: str, class_name: str):
+    """Import *module_name* and return its attribute *class_name*.
+
+    Raise a LoaderError if the module cannot be imported or
+    if it has no attribute of the requested name.
+    """
+    # step 1: import the module
+    try:
+        module = importlib.import_module(module_name)
+    except Exception as err:
+        raise errors.LoaderError(f'cannot load module {module_name}') from err
+
+    # step 2: look up the class within the module
+    try:
+        return getattr(module, class_name)
+    except Exception as err:
+        raise errors.LoaderError(f'cannot load class {class_name} from module {module_name}') from err
+
+
+def _unpack_name(name):
+    """Split a qualified class name at its last dot into (module, class).
+
+    Raise a TypeError if *name* is not a string, and a ValueError if it
+    has no dot or nothing in front of the last dot.
+    """
+    if not isinstance(name, str):
+        raise TypeError(name)
+    # rpartition yields an empty separator if there is no dot at all
+    module_name, separator, class_name = name.rpartition('.')
+    if separator == '' or module_name == '':
+        raise ValueError('name must be a qualified class name.')
+    return module_name, class_name
+
+
+class ReaderBuilder():
+    """Create `bsie.base.Reader` instances from qualified class names.
+
+    A reader is identified by its qualified class name
+    (e.g., bsie.reader.path.Path). The *kwargs* argument maps such
+    names to the keyword arguments of the respective constructor;
+    names without an entry are built without arguments.
+    Every reader is built at most once: instances are cached by name
+    and the cached instance is returned on subsequent requests.
+
+    """
+
+    # constructor keyword arguments, by qualified class name
+    _kwargs: typing.Dict[str, typing.Dict[str, typing.Any]]
+
+    # previously built readers, by qualified class name
+    _cache: typing.Dict[str, base.Reader]
+
+    def __init__(self, kwargs: typing.Dict[str, typing.Dict[str, typing.Any]]):
+        self._kwargs = kwargs
+        self._cache = {}
+
+    def build(self, name: str) -> base.Reader:
+        """Return the reader instance for the qualified class *name*."""
+        # serve from the cache whenever possible
+        try:
+            return self._cache[name]
+        except KeyError:
+            pass
+
+        # validate the name and import the reader class
+        cls = _safe_load(*_unpack_name(name))
+
+        # fetch the constructor arguments
+        kwargs = self._kwargs.get(name, {})
+        if not isinstance(kwargs, dict):
+            raise TypeError(f'expected a kwargs dict, found {bsfs.typename(kwargs)}')
+
+        try:
+            # instantiate and remember the reader
+            reader = cls(**kwargs)
+            self._cache[name] = reader
+            return reader
+
+        except Exception as err:
+            raise errors.BuilderError(f'failed to build reader {name} due to {bsfs.typename(err)}: {err}') from err
+
+
+class ExtractorBuilder():
+    """Create `bsie.base.Extractor` instances from build specifications.
+
+    The same extractor class may be built several times (typically
+    with different arguments). The ExtractorBuilder hence takes a
+    list of specifications and addresses them by their position.
+    A specification is a dict with exactly one item: the extractor's
+    qualified class name as key and a dict of keyword arguments as value.
+    Example: [{'bsie.extractor.generic.path.Path': {}}, ]
+
+    """
+
+    # build specifications
+    _specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]
+
+    def __init__(self, specs: typing.List[typing.Dict[str, typing.Dict[str, typing.Any]]]):
+        self._specs = specs
+
+    def __iter__(self) -> typing.Iterator[int]:
+        """Iterate over the indices of the build specifications."""
+        return iter(range(len(self._specs)))
+
+    def build(self, index: int) -> base.Extractor:
+        """Return a new instance of the extractor specified at position *index*."""
+        specs = self._specs[index]
+
+        # a specification has the form {name: {kwargs}}
+        if not isinstance(specs, dict):
+            raise TypeError(f'expected a dict, found {bsfs.typename(specs)}')
+        if len(specs) != 1:
+            raise TypeError(f'expected a dict of length one, found {len(specs)}')
+
+        # unpack the single item
+        (name, kwargs), = specs.items()
+        if not isinstance(kwargs, dict):
+            raise TypeError(f'expected a dict, found {bsfs.typename(kwargs)}')
+
+        # validate the name and import the extractor class
+        cls = _safe_load(*_unpack_name(name))
+
+        try:
+            return cls(**kwargs)
+
+        except Exception as err:
+            raise errors.BuilderError(f'failed to build extractor {name} due to {bsfs.typename(err)}: {err}') from err
+
+
+class PipelineBuilder():
+    """Assemble `bsie.tools.pipeline.Pipeline` instances.
+
+    Extractors or readers that cannot be loaded or built are reported
+    to the log and left out of the pipeline.
+    """
+
+    # Prefix to be used in the Pipeline.
+    prefix: bsfs.Namespace
+
+    # builder for Readers.
+    rbuild: ReaderBuilder
+
+    # builder for Extractors.
+    ebuild: ExtractorBuilder
+
+    def __init__(
+            self,
+            prefix: bsfs.Namespace,
+            reader_builder: ReaderBuilder,
+            extractor_builder: ExtractorBuilder,
+            ):
+        self.prefix = prefix
+        self.rbuild = reader_builder
+        self.ebuild = extractor_builder
+
+    def build(self) -> pipeline.Pipeline:
+        """Return a Pipeline with all extractors that could be built."""
+        ext2rdr = {}
+
+        for eidx in self.ebuild:
+            # step 1: the extractor. Skip it if it cannot be loaded or built.
+            try:
+                ext = self.ebuild.build(eidx)
+            except errors.LoaderError as err:
+                logger.error('failed to load extractor: %s', err)
+                continue
+            except errors.BuilderError as err:
+                logger.error(str(err))
+                continue
+
+            # step 2: the reader, if the extractor asks for one.
+            # The extractor is only registered if its reader is available.
+            try:
+                reader_name = ext.CONTENT_READER
+                ext2rdr[ext] = self.rbuild.build(reader_name) if reader_name is not None else None
+            except errors.LoaderError as err:
+                logger.error('failed to load reader: %s', err)
+            except errors.BuilderError as err:
+                logger.error(str(err))
+
+        return pipeline.Pipeline(self.prefix, ext2rdr)
+
+
+
+## EOF ##
diff --git a/bsie/tools/pipeline.py b/bsie/tools/pipeline.py
new file mode 100644
index 0000000..20e8ddf
--- /dev/null
+++ b/bsie/tools/pipeline.py
@@ -0,0 +1,144 @@
+"""The Pipeline binds readers and extractors to produce triples from files.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+from collections import defaultdict
+import logging
+import typing
+
+# bsie imports
+from bsie import base
+from bsie.utils import bsfs, node, ns
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Pipeline',
+ )
+
+# constants
+FILE_PREFIX = 'file#'
+
+## code ##
+
+logger = logging.getLogger(__name__)
+
+class Pipeline():
+    """Extraction pipeline to generate triples from files.
+
+    The Pipeline binds readers and extractors, and performs
+    the necessary operations to produce triples from a file.
+    It takes a best-effort approach to extract as many triples
+    as possible. Errors during the extraction are passed over
+    and reported to the log.
+
+    """
+
+    # combined extractor schemas.
+    _schema: bsfs.schema.Schema
+
+    # node prefix.
+    _prefix: bsfs.Namespace
+
+    # extractor -> reader mapping
+    _ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]]
+
+    def __init__(
+            self,
+            prefix: bsfs.Namespace,
+            ext2rdr: typing.Dict[base.extractor.Extractor, typing.Optional[base.reader.Reader]]
+            ):
+        """Create a pipeline.
+
+        *prefix* is the namespace for file nodes; FILE_PREFIX is appended to it.
+        *ext2rdr* maps each extractor to the reader that provides its content,
+        or to None if the extractor needs no content.
+        """
+        # store core members
+        self._prefix = prefix + FILE_PREFIX
+        self._ext2rdr = ext2rdr
+        # compile schema from all extractors
+        self._schema = bsfs.schema.Schema.Union(ext.schema for ext in ext2rdr)
+
+    def __str__(self) -> str:
+        return bsfs.typename(self)
+
+    def __repr__(self) -> str:
+        return f'{bsfs.typename(self)}(...)'
+
+    def __hash__(self) -> int:
+        # __eq__ compares the extractor/reader mapping irrespective of its order.
+        # Hash the items as an unordered set so that equal pipelines hash equally.
+        return hash((type(self), self._prefix, self._schema, frozenset(self._ext2rdr.items())))
+
+    def __eq__(self, other: typing.Any) -> bool:
+        return isinstance(other, type(self)) \
+           and self._schema == other._schema \
+           and self._prefix == other._prefix \
+           and self._ext2rdr == other._ext2rdr
+
+    @property
+    def schema(self) -> bsfs.schema.Schema:
+        """Return the pipeline's schema (combined from all extractors)."""
+        return self._schema
+
+    @property
+    def principals(self) -> typing.Iterator[bsfs.schema.Predicate]:
+        """Return the principal predicates that can be extracted."""
+        return iter({pred for ext in self._ext2rdr for pred in ext.principals})
+
+    def subschema(self, principals: typing.Iterable[bsfs.schema.Predicate]) -> bsfs.schema.Schema:
+        """Return the subset of the schema that supports the given *principals*."""
+        # materialize principals
+        principals = set(principals)
+        # collect and combine schemas from extractors that offer a requested principal
+        return bsfs.schema.Schema.Union({
+            ext.schema
+            for ext
+            in self._ext2rdr
+            if not set(ext.principals).isdisjoint(principals)
+            })
+
+    def __call__(
+            self,
+            path: bsfs.URI,
+            principals: typing.Optional[typing.Iterable[bsfs.schema.Predicate]] = None,
+            ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
+        """Extract triples from the file at *path*. Optionally, limit triples to *principals*.
+
+        ReaderError and ExtractorError are logged and the affected reader or
+        extractor is skipped; the remaining ones are still processed.
+        """
+        # get principals
+        principals = set(principals) if principals is not None else set(self.schema.predicates())
+
+        # get extractors that offer at least one of the principals
+        extractors = {ext for ext in self._ext2rdr if not set(ext.principals).isdisjoint(principals)}
+
+        # corner-case short-cut
+        if len(extractors) == 0:
+            return
+
+        # get readers -> extractors mapping, so that each reader is invoked only once
+        rdr2ext = defaultdict(set)
+        for ext in extractors:
+            rdr = self._ext2rdr[ext]
+            rdr2ext[rdr].add(ext)
+
+        # create subject for file
+        uuid = bsfs.uuid.UCID.from_path(path)
+        subject = node.Node(ns.bsfs.File, self._prefix[uuid])
+
+        # extract information
+        for rdr, extrs in rdr2ext.items():
+            try:
+                # get content
+                content = rdr(path) if rdr is not None else None
+
+                # apply extractors on this content
+                for ext in extrs:
+                    try:
+                        # get predicate/value tuples.
+                        # NOTE: The yielded node must not be bound to *subject*, otherwise
+                        # the next extractor could receive a node other than the file's.
+                        for subj, pred, value in ext.extract(subject, content, principals):
+                            yield subj, pred, value
+
+                    except base.errors.ExtractorError as err:
+                        # critical extractor failure.
+                        logger.error('%s failed to extract triples from content: %s', ext, err)
+
+            except base.errors.ReaderError as err:
+                # failed to read any content. skip.
+                logger.error('%s failed to read content: %s', rdr, err)
+
+
+## EOF ##
diff --git a/bsie/utils/__init__.py b/bsie/utils/__init__.py
new file mode 100644
index 0000000..bd22236
--- /dev/null
+++ b/bsie/utils/__init__.py
@@ -0,0 +1,22 @@
+"""Common tools and definitions.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from . import bsfs
+from . import namespaces as ns
+from . import node
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'bsfs',
+ 'node',
+ 'ns',
+ )
+
+## EOF ##
diff --git a/bsie/utils/bsfs.py b/bsie/utils/bsfs.py
new file mode 100644
index 0000000..0b88479
--- /dev/null
+++ b/bsie/utils/bsfs.py
@@ -0,0 +1,27 @@
+"""BSFS bridge, provides BSFS bindings for BSIE.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsfs imports
+from bsfs import Open, schema
+from bsfs.apps.init import init_sparql_store
+from bsfs.namespace import Namespace
+from bsfs.utils import URI, typename, uuid
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Namespace',
+ 'Open',
+ 'URI',
+ 'init_sparql_store',
+ 'schema',
+ 'typename',
+ 'uuid',
+ )
+
+## EOF ##
diff --git a/bsie/utils/namespaces.py b/bsie/utils/namespaces.py
new file mode 100644
index 0000000..a29fc1b
--- /dev/null
+++ b/bsie/utils/namespaces.py
@@ -0,0 +1,27 @@
+"""Default namespaces used throughout BSIE.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# inner-module imports
+from . import bsfs as _bsfs
+
+# constants
+bse = _bsfs.Namespace('http://bsfs.ai/schema/Entity')
+bsfs = _bsfs.Namespace('http://bsfs.ai/schema', fsep='/')
+bsm = _bsfs.Namespace('http://bsfs.ai/schema/Meta')
+xsd = _bsfs.Namespace('http://www.w3.org/2001/XMLSchema')
+
+# export
+__all__: typing.Sequence[str] = (
+ 'bse',
+ 'bsfs',
+ 'bsm',
+ 'xsd',
+ )
+
+## EOF ##
diff --git a/bsie/utils/node.py b/bsie/utils/node.py
new file mode 100644
index 0000000..ecf39cd
--- /dev/null
+++ b/bsie/utils/node.py
@@ -0,0 +1,53 @@
+"""Lightweight Node to bridge to BSFS.
+
+Part of the bsie module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import typing
+
+# bsie imports
+from bsie.utils import bsfs
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Node',
+ )
+
+
+## code ##
+
+class Node():
+    """Minimal node representation that does not depend on a bsfs storage.
+
+    A Node is identified by its type and its URI. Nodes with the same
+    type and URI compare equal and have the same hash.
+    """
+
+    # node type.
+    node_type: bsfs.URI
+
+    # node URI.
+    uri: bsfs.URI
+
+    def __init__(
+            self,
+            node_type: bsfs.URI,
+            uri: bsfs.URI,
+            ):
+        # both arguments are converted to URIs
+        self.node_type = bsfs.URI(node_type)
+        self.uri = bsfs.URI(uri)
+
+    def __eq__(self, other: typing.Any) -> bool:
+        if not isinstance(other, Node):
+            return False
+        return (other.node_type, other.uri) == (self.node_type, self.uri)
+
+    def __hash__(self) -> int:
+        identity = (type(self), self.node_type, self.uri)
+        return hash(identity)
+
+    def __str__(self) -> str:
+        return f'{bsfs.typename(self)}({self.node_type}, {self.uri})'
+
+    def __repr__(self) -> str:
+        return f'{bsfs.typename(self)}({self.node_type}, {self.uri})'
+
+## EOF ##
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..ee9e0fd
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,20 @@
+
+from setuptools import find_packages, setup
+import os
+
+# Read the long description from the README next to this file.
+# The context manager closes the file; the encoding is fixed so that the
+# result does not depend on the platform's default.
+with open(os.path.join(os.path.dirname(__file__), 'README'), encoding='utf-8') as ifile:
+    long_description = ifile.read()
+
+setup(
+    name='bsie',
+    version='0.0.1',
+    author='Matthias Baumgartner',
+    author_email='dev@igsor.net',
+    description='Extract information from files and store them in a BSFS.',
+    long_description=long_description,
+    license='BSD',
+    license_files=('LICENSE', ),
+    url='https://www.igsor.net/projects/blackstar/bsie/',
+    download_url='https://pip.igsor.net',
+    # Ship the subpackages (bsie.apps, bsie.base, ...) as well.
+    # Listing only 'bsie' would install the top-level package without them.
+    packages=find_packages(include=('bsie', 'bsie.*')),
+    install_requires=('rdflib', 'bsfs'),
+    python_requires=">=3.7",
+)
+
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/__init__.py
diff --git a/test/apps/__init__.py b/test/apps/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/apps/__init__.py
diff --git a/test/apps/test_index.py b/test/apps/test_index.py
new file mode 100644
index 0000000..9cdc656
--- /dev/null
+++ b/test/apps/test_index.py
@@ -0,0 +1,159 @@
+"""
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import contextlib
+import io
+import os
+import rdflib
+import unittest
+
+# bsie imports
+from bsie.utils import ns
+
+# objects to test
+from bsie.apps.index import main
+
+
+## code ##
+
+class TestIndex(unittest.TestCase):
+    """Run the index app on the test fixtures and check the produced triples."""
+
+    # prefix of the file nodes, as it results from the --user argument
+    _PREFIX = 'http://example.com/me/file#'
+
+    # author that is expected on every file
+    _AUTHOR = 'Me, myself, and I'
+
+    # expected files: (node identifier, filename, filesize)
+    _FILES = (
+        ('2f4109b40107cc50e0884755a1a961ed126887e49b8dbaf0e146b2e226aa6647', 'alpha_second', '696'),
+        ('441f3d10c8ff489fe8e33e639606512f6c463151cc429de7e554b9af670c2ece', 'omega_second', '503'),
+        ('69b98ecf7aff3e95b09688ba93331678eb8397817111f674c9558e6dd8f5e871', 'td_first', '911'),
+        ('78f7eb7f0d8221cdb2cb26c978fa42a11f75eb87becc768f4474134cb1e06926', 'testfile', '885'),
+        ('80818b8ec2ee1919116dba9c8a7e0a4608313cf3b463cd88e9ed77a700dd92d3', 'bar_first', '956'),
+        ('976d2ea0e58488678cc7e435fbfadabfb6eb6cf50ad51862f38f73729ed11795', 'omega_first', '648'),
+        ('997e2fbb7494a3818ec782d2bc87bf1cffafba6b9c0f658e4a6c18a723e944d3', 'alpha_first', '754'),
+        ('a8af899ecdab60dfaea8ec7f934053624c80a1054539e163f2c7eaa986c2777d', 'foo_second', '585'),
+        ('b8fd7fba818254166a6043195004138ebda6923e012442f819a2c49671136c70', 'bar_second', '636'),
+        ('d43758ace82154a1cc10ca0dfef63cb20dd831f9c87edd6dc06539eefe67371d', 'foo_first', '546'),
+        ('d803187cbf3676ae9d38126270a6152c60431589aa3bb3824baf8954e9c097f1', 'td_second', '703'),
+        )
+
+    def _run_main(self, *flags):
+        """Invoke the app recursively on the test directory and the test file."""
+        here = os.path.dirname(__file__)
+        return main([
+            *flags,
+            '-r',
+            '--user', 'http://example.com/me',
+            os.path.join(here, 'testdir'),
+            os.path.join(here, 'testfile'),
+            ])
+
+    def test_main(self):
+        bsfs = self._run_main()
+        graph = bsfs._backend._graph
+
+        # every file has a type, an author, a filename, and a filesize
+        expected = set()
+        for ucid, filename, filesize in self._FILES:
+            subject = rdflib.URIRef(self._PREFIX + ucid)
+            expected |= {
+                (subject, rdflib.RDF.type, rdflib.URIRef(ns.bsfs.File)),
+                (subject, rdflib.URIRef(ns.bse.author), rdflib.Literal(self._AUTHOR, datatype=rdflib.XSD.string)),
+                (subject, rdflib.URIRef(ns.bse.filename), rdflib.Literal(filename, datatype=rdflib.XSD.string)),
+                (subject, rdflib.URIRef(ns.bse.filesize), rdflib.Literal(filesize, datatype=rdflib.XSD.integer)),
+                }
+        self.assertTrue(set(graph).issuperset(expected))
+
+        # NOTE: we don't check the values of ns.bsm.t_created since they depend on the execution time.
+        # Such a triple would look like this:
+        # (rdflib.URIRef(prefix + '2f41...6647'), rdflib.URIRef(ns.bsm.t_created), rdflib.Literal('1670..........', datatype=rdflib.XSD.integer)),
+        # instead, we simply check if there's such a predicate for each file
+        self.assertSetEqual(
+            {sub for sub, _ in graph.subject_objects(rdflib.URIRef(ns.bsm.t_created))},
+            {rdflib.URIRef(self._PREFIX + ucid) for ucid, _, _ in self._FILES})
+
+    def test_print(self):
+        outbuf = io.StringIO()
+        with contextlib.redirect_stdout(outbuf):
+            self._run_main('--print')
+
+        # one line per file and predicate
+        expected = set()
+        for ucid, filename, filesize in self._FILES:
+            subject = f'Node(http://bsfs.ai/schema/File, http://example.com/me/file#{ucid})'
+            expected |= {
+                f'{subject} Predicate({ns.bse.author}) Me, myself, and I',
+                f'{subject} Predicate({ns.bse.filename}) {filename}',
+                f'{subject} Predicate({ns.bse.filesize}) {filesize}',
+                }
+        self.assertSetEqual(set(outbuf.getvalue().split('\n')) - {''}, expected)
+
+
+## main ##
+
+if __name__ == '__main__':
+ unittest.main()
+
+## EOF ##
diff --git a/test/apps/test_info.py b/test/apps/test_info.py
new file mode 100644
index 0000000..6f4d98f
--- /dev/null
+++ b/test/apps/test_info.py
@@ -0,0 +1,42 @@
+"""
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import argparse
+import contextlib
+import io
+import unittest
+
+# objects to test
+from bsie.apps.info import main
+
+
+## code ##
+
+# Tests for the command-line entry point `main` imported from bsie.apps.info.
+# NOTE(review): the class is called TestIndex although it exercises the info app;
+# looks like a leftover from test_index.py -- confirm before renaming.
+class TestIndex(unittest.TestCase):
+ # the 'predicates' command must print exactly the expected URIs, one per line
+ def test_predicates(self):
+ outbuf = io.StringIO()
+ with contextlib.redirect_stdout(outbuf):
+ # show predicates infos
+ main(['predicates'])
+ # verify output
+ # (empty lines are dropped; comparing sets makes the check independent of the print order)
+ self.assertSetEqual({pred for pred in outbuf.getvalue().split('\n') if pred != ''}, {
+ 'http://bsfs.ai/schema/Entity#author',
+ 'http://bsfs.ai/schema/Predicate',
+ 'http://bsfs.ai/schema/Entity#filename',
+ 'http://bsfs.ai/schema/Entity#filesize',
+ })
+
+ # an unknown command must terminate via SystemExit
+ def test_invalid(self):
+ # stderr is redirected into a throw-away buffer so nothing leaks into the test output
+ with contextlib.redirect_stderr(io.StringIO()):
+ self.assertRaises(SystemExit, main, ['foobar'])
+
+## main ##
+
+if __name__ == '__main__':
+ unittest.main()
+
+## EOF ##
diff --git a/test/apps/testdir/alpha/alpha_first b/test/apps/testdir/alpha/alpha_first
new file mode 100644
index 0000000..f96fdee
--- /dev/null
+++ b/test/apps/testdir/alpha/alpha_first
@@ -0,0 +1,16 @@
+Turpis tincidunt id aliquet risus feugiat in ante metus.
+Vel turpis nunc eget lorem dolor.
+Lorem mollis aliquam ut porttitor leo a diam sollicitudin.
+Sit amet mattis vulputate enim nulla aliquet porttitor lacus luctus.
+Vitae et leo duis ut diam.
+Integer eget aliquet nibh praesent tristique magna sit.
+Volutpat sed cras ornare arcu dui.
+Consectetur adipiscing elit duis tristique sollicitudin nibh.
+Interdum varius sit amet mattis vulputate.
+A arcu cursus vitae congue.
+Risus nec feugiat in fermentum posuere urna nec tincidunt praesent.
+Sit amet dictum sit amet justo donec enim diam.
+Maecenas accumsan lacus vel facilisis.
+Erat velit scelerisque in dictum non consectetur a.
+Tempor orci dapibus ultrices in iaculis nunc.
+Nisi lacus sed viverra tellus.
diff --git a/test/apps/testdir/alpha/alpha_second b/test/apps/testdir/alpha/alpha_second
new file mode 100644
index 0000000..ae83ce8
--- /dev/null
+++ b/test/apps/testdir/alpha/alpha_second
@@ -0,0 +1,12 @@
+Et sollicitudin ac orci phasellus egestas tellus rutrum tellus.
+Orci dapibus ultrices in iaculis nunc sed augue.
+Tincidunt vitae semper quis lectus nulla at.
+Maecenas ultricies mi eget mauris pharetra et.
+Porttitor massa id neque aliquam vestibulum morbi blandit.
+Et magnis dis parturient montes nascetur ridiculus mus mauris.
+Ac orci phasellus egestas tellus rutrum tellus pellentesque.
+Donec ac odio tempor orci dapibus.
+Quis imperdiet massa tincidunt nunc pulvinar sapien et ligula.
+Potenti nullam ac tortor vitae purus faucibus ornare suspendisse sed.
+Orci porta non pulvinar neque laoreet suspendisse interdum consectetur.
+Mauris pellentesque pulvinar pellentesque habitant morbi tristique.
diff --git a/test/apps/testdir/alpha/omega/omega_first b/test/apps/testdir/alpha/omega/omega_first
new file mode 100644
index 0000000..e594737
--- /dev/null
+++ b/test/apps/testdir/alpha/omega/omega_first
@@ -0,0 +1,14 @@
+Neque gravida in fermentum et sollicitudin.
+Sodales ut eu sem integer vitae justo eget magna fermentum.
+Amet nulla facilisi morbi tempus iaculis.
+Proin sagittis nisl rhoncus mattis rhoncus urna neque.
+Aliquam sem fringilla ut morbi tincidunt augue interdum velit euismod.
+Sagittis eu volutpat odio facilisis.
+Aliquet porttitor lacus luctus accumsan tortor posuere ac ut.
+Sed arcu non odio euismod lacinia.
+Faucibus et molestie ac feugiat.
+Urna neque viverra justo nec ultrices dui sapien eget.
+Amet commodo nulla facilisi nullam.
+Pretium lectus quam id leo in vitae.
+A cras semper auctor neque.
+Sed arcu non odio euismod lacinia at quis risus sed.
diff --git a/test/apps/testdir/alpha/omega/omega_second b/test/apps/testdir/alpha/omega/omega_second
new file mode 100644
index 0000000..0c9857d
--- /dev/null
+++ b/test/apps/testdir/alpha/omega/omega_second
@@ -0,0 +1,10 @@
+Commodo sed egestas egestas fringilla phasellus.
+Ac tortor dignissim convallis aenean et tortor at risus.
+Lorem dolor sed viverra ipsum nunc aliquet bibendum enim.
+Quis lectus nulla at volutpat diam ut.
+Tincidunt id aliquet risus feugiat in ante metus.
+Tincidunt arcu non sodales neque.
+Amet est placerat in egestas erat imperdiet sed euismod.
+Duis tristique sollicitudin nibh sit amet.
+Sed arcu non odio euismod lacinia at.
+Ullamcorper morbi tincidunt ornare massa eget egestas purus viverra accumsan.
diff --git a/test/apps/testdir/foo/bar/bar_first b/test/apps/testdir/foo/bar/bar_first
new file mode 100644
index 0000000..e9edb3f
--- /dev/null
+++ b/test/apps/testdir/foo/bar/bar_first
@@ -0,0 +1,20 @@
+Elementum eu facilisis sed odio morbi quis commodo.
+Enim nunc faucibus a pellentesque sit amet porttitor.
+Etiam non quam lacus suspendisse faucibus interdum.
+Viverra aliquet eget sit amet tellus.
+Arcu vitae elementum curabitur vitae.
+Feugiat vivamus at augue eget arcu dictum.
+Commodo quis imperdiet massa tincidunt nunc.
+Urna duis convallis convallis tellus id interdum.
+Commodo sed egestas egestas fringilla phasellus.
+Sodales neque sodales ut etiam sit amet nisl.
+Sem integer vitae justo eget magna fermentum iaculis.
+Id diam maecenas ultricies mi.
+Aliquet nibh praesent tristique magna sit amet purus gravida.
+Ut enim blandit volutpat maecenas volutpat.
+Ipsum a arcu cursus vitae congue mauris.
+Donec ultrices tincidunt arcu non.
+Nulla posuere sollicitudin aliquam ultrices sagittis orci a scelerisque purus.
+Egestas maecenas pharetra convallis posuere.
+Feugiat in fermentum posuere urna nec.
+Nulla malesuada pellentesque elit eget gravida cum sociis.
diff --git a/test/apps/testdir/foo/bar/bar_second b/test/apps/testdir/foo/bar/bar_second
new file mode 100644
index 0000000..fb95896
--- /dev/null
+++ b/test/apps/testdir/foo/bar/bar_second
@@ -0,0 +1,14 @@
+Augue ut lectus arcu bibendum at varius vel pharetra vel.
+Mattis aliquam faucibus purus in.
+In tellus integer feugiat scelerisque.
+Eget velit aliquet sagittis id consectetur purus ut faucibus pulvinar.
+Augue mauris augue neque gravida.
+Pulvinar neque laoreet suspendisse interdum consectetur libero id faucibus.
+Tellus elementum sagittis vitae et leo duis.
+Eget est lorem ipsum dolor sit amet consectetur.
+Volutpat sed cras ornare arcu.
+Faucibus a pellentesque sit amet.
+Turpis egestas maecenas pharetra convallis.
+Faucibus interdum posuere lorem ipsum dolor sit amet.
+Id semper risus in hendrerit.
+Amet volutpat consequat mauris nunc.
diff --git a/test/apps/testdir/foo/foo_first b/test/apps/testdir/foo/foo_first
new file mode 100644
index 0000000..ed1e052
--- /dev/null
+++ b/test/apps/testdir/foo/foo_first
@@ -0,0 +1,11 @@
+Venenatis tellus in metus vulputate eu scelerisque felis imperdiet proin.
+Orci phasellus egestas tellus rutrum.
+Feugiat vivamus at augue eget arcu dictum varius.
+Justo eget magna fermentum iaculis eu non.
+A erat nam at lectus urna duis.
+Quam quisque id diam vel quam elementum pulvinar etiam.
+Amet commodo nulla facilisi nullam vehicula ipsum a.
+Sapien faucibus et molestie ac feugiat.
+Aliquam vestibulum morbi blandit cursus risus at ultrices.
+Purus faucibus ornare suspendisse sed nisi.
+In massa tempor nec feugiat nisl pretium fusce id velit.
diff --git a/test/apps/testdir/foo/foo_second b/test/apps/testdir/foo/foo_second
new file mode 100644
index 0000000..95e46ae
--- /dev/null
+++ b/test/apps/testdir/foo/foo_second
@@ -0,0 +1,12 @@
+Sit amet consectetur adipiscing elit ut aliquam purus.
+Vulputate dignissim suspendisse in est ante in nibh.
+Eu feugiat pretium nibh ipsum consequat nisl vel pretium.
+Egestas purus viverra accumsan in nisl.
+Ac odio tempor orci dapibus ultrices.
+At imperdiet dui accumsan sit amet.
+Elementum integer enim neque volutpat ac tincidunt vitae semper.
+Mi in nulla posuere sollicitudin aliquam ultrices sagittis.
+Aliquam sem et tortor consequat.
+Tristique senectus et netus et malesuada fames ac turpis.
+Quis hendrerit dolor magna eget est lorem ipsum.
+Ut consequat semper viverra nam libero.
diff --git a/test/apps/testdir/td_first b/test/apps/testdir/td_first
new file mode 100644
index 0000000..21eab9c
--- /dev/null
+++ b/test/apps/testdir/td_first
@@ -0,0 +1,18 @@
+Urna duis convallis convallis tellus id interdum velit.
+Risus in hendrerit gravida rutrum.
+Odio pellentesque diam volutpat commodo sed.
+Duis convallis convallis tellus id interdum velit laoreet id donec.
+Duis at tellus at urna.
+Egestas maecenas pharetra convallis posuere morbi leo urna molestie at.
+Et leo duis ut diam quam nulla porttitor massa id.
+Nunc eget lorem dolor sed viverra ipsum nunc aliquet bibendum.
+Sodales ut etiam sit amet nisl purus in.
+Ac felis donec et odio pellentesque diam volutpat commodo.
+Nunc mi ipsum faucibus vitae aliquet.
+Volutpat ac tincidunt vitae semper quis lectus nulla at volutpat.
+Mollis aliquam ut porttitor leo.
+Vestibulum rhoncus est pellentesque elit ullamcorper dignissim cras.
+Pulvinar proin gravida hendrerit lectus a.
+Ultrices dui sapien eget mi proin.
+Dui vivamus arcu felis bibendum ut.
+Aliquam eleifend mi in nulla posuere sollicitudin aliquam ultrices sagittis.
diff --git a/test/apps/testdir/td_second b/test/apps/testdir/td_second
new file mode 100644
index 0000000..496ff0e
--- /dev/null
+++ b/test/apps/testdir/td_second
@@ -0,0 +1,14 @@
+Egestas purus viverra accumsan in.
+Auctor urna nunc id cursus metus aliquam eleifend.
+Morbi tincidunt augue interdum velit.
+In egestas erat imperdiet sed euismod nisi porta lorem mollis.
+Sed augue lacus viverra vitae congue eu consequat.
+Ut pharetra sit amet aliquam id.
+Aenean euismod elementum nisi quis eleifend.
+Hac habitasse platea dictumst vestibulum rhoncus est pellentesque elit ullamcorper.
+Eget nunc lobortis mattis aliquam faucibus purus.
+Sit amet luctus venenatis lectus magna fringilla.
+Placerat orci nulla pellentesque dignissim enim sit amet venenatis.
+Montes nascetur ridiculus mus mauris.
+Morbi enim nunc faucibus a pellentesque sit amet.
+Et netus et malesuada fames ac turpis egestas.
diff --git a/test/apps/testfile b/test/apps/testfile
new file mode 100644
index 0000000..b56928e
--- /dev/null
+++ b/test/apps/testfile
@@ -0,0 +1,16 @@
+Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+Fames ac turpis egestas maecenas pharetra convallis posuere morbi.
+Etiam erat velit scelerisque in dictum non consectetur a erat.
+Dolor purus non enim praesent elementum facilisis.
+Nulla porttitor massa id neque aliquam vestibulum morbi blandit cursus.
+Adipiscing vitae proin sagittis nisl rhoncus mattis rhoncus urna neque.
+Aenean pharetra magna ac placerat.
+Pulvinar proin gravida hendrerit lectus a.
+Iaculis nunc sed augue lacus viverra vitae.
+Ac tortor vitae purus faucibus ornare suspendisse sed.
+Purus in mollis nunc sed id semper.
+Non consectetur a erat nam at lectus urna.
+In ante metus dictum at tempor commodo ullamcorper.
+Auctor augue mauris augue neque gravida in fermentum.
+Nunc scelerisque viverra mauris in.
+Morbi leo urna molestie at elementum.
diff --git a/test/base/__init__.py b/test/base/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/base/__init__.py
diff --git a/test/base/test_extractor.py b/test/base/test_extractor.py
new file mode 100644
index 0000000..30974ef
--- /dev/null
+++ b/test/base/test_extractor.py
@@ -0,0 +1,70 @@
+"""
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import unittest
+
+# bsie imports
+from bsie.utils import bsfs, ns
+
+# objects to test
+from bsie.base import extractor
+
+
+## code ##
+
+# Minimal concrete Extractor used as a test fixture.
+# Its schema is the shared SCHEMA_PREAMBLE plus two predicates, bse:author and bse:comment.
+class StubExtractor(extractor.Extractor):
+ def __init__(self):
+ super().__init__(bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ bse:author rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "false"^^xsd:boolean .
+ bse:comment rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "false"^^xsd:boolean .
+ '''))
+
+ # extraction itself is never called by the tests in this module
+ def extract(self, subject, content, predicates):
+ raise NotImplementedError()
+
+# Subclass that changes nothing; used to check that str/repr/eq/hash
+# distinguish between a class and its subclass.
+class StubSub(StubExtractor):
+ pass
+
+# Tests for the behaviour that concrete extractors inherit from extractor.Extractor.
+class TestExtractor(unittest.TestCase):
+ # str/repr are derived from the class name; equality and hash agree for
+ # instances of the same class and differ between a class and its subclass
+ def test_essentials(self):
+ ext = StubExtractor()
+ self.assertEqual(str(ext), 'StubExtractor')
+ self.assertEqual(repr(ext), 'StubExtractor()')
+ self.assertEqual(ext, StubExtractor())
+ self.assertEqual(hash(ext), hash(StubExtractor()))
+
+ sub = StubSub()
+ self.assertEqual(str(sub), 'StubSub')
+ self.assertEqual(repr(sub), 'StubSub()')
+ self.assertEqual(sub, StubSub())
+ self.assertEqual(hash(sub), hash(StubSub()))
+ self.assertNotEqual(ext, sub)
+ self.assertNotEqual(hash(ext), hash(sub))
+
+ # principals must list the predicates declared by the extractor's schema
+ def test_principals(self):
+ # rebuild by hand, on top of an empty schema, the two predicates that StubExtractor declares
+ schema = bsfs.schema.Schema.Empty()
+ entity = schema.node(ns.bsfs.Node).get_child(ns.bsfs.Entity)
+ string = schema.literal(ns.bsfs.Literal).get_child(bsfs.URI('http://www.w3.org/2001/XMLSchema#string'))
+ p_author = schema.predicate(ns.bsfs.Predicate).get_child(ns.bse.author, domain=entity, range=string)
+ p_comment = schema.predicate(ns.bsfs.Predicate).get_child(ns.bse.comment, domain=entity, range=string)
+ ext = StubExtractor()
+ # '-' binds tighter than '|': the root predicate is removed from the empty schema's
+ # predicates first, then the two hand-built predicates are added to the expectation
+ self.assertSetEqual(set(ext.principals),
+ {p_author, p_comment} | set(schema.predicates()) - {schema.predicate(ns.bsfs.Predicate)})
+
+
+## main ##
+
+if __name__ == '__main__':
+ unittest.main()
+
+## EOF ##
diff --git a/test/base/test_reader.py b/test/base/test_reader.py
new file mode 100644
index 0000000..a907eb9
--- /dev/null
+++ b/test/base/test_reader.py
@@ -0,0 +1,45 @@
+"""
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import unittest
+
+# objects to test
+from bsie import base
+
+
+## code ##
+
+# Minimal concrete Reader used as a test fixture; calling it is never exercised
+# by the tests in this module.
+class StubReader(base.Reader):
+ def __call__(self, path):
+ raise NotImplementedError()
+
+# Subclass that changes nothing; used to check that str/repr/eq/hash
+# distinguish between a class and its subclass.
+class StubSub(StubReader):
+ pass
+
+# Tests for the behaviour that concrete readers inherit from base.Reader.
+class TestReader(unittest.TestCase):
+ # str/repr are derived from the class name; equality and hash agree for
+ # instances of the same class and differ between a class and its subclass
+ def test_essentials(self):
+ ext = StubReader()
+ self.assertEqual(str(ext), 'StubReader')
+ self.assertEqual(repr(ext), 'StubReader()')
+ self.assertEqual(ext, StubReader())
+ self.assertEqual(hash(ext), hash(StubReader()))
+
+ sub = StubSub()
+ self.assertEqual(str(sub), 'StubSub')
+ self.assertEqual(repr(sub), 'StubSub()')
+ self.assertEqual(sub, StubSub())
+ self.assertEqual(hash(sub), hash(StubSub()))
+ self.assertNotEqual(ext, sub)
+ self.assertNotEqual(hash(ext), hash(sub))
+
+
+## main ##
+
+if __name__ == '__main__':
+ unittest.main()
+
+## EOF ##
diff --git a/test/extractor/__init__.py b/test/extractor/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/extractor/__init__.py
diff --git a/test/extractor/generic/__init__.py b/test/extractor/generic/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/extractor/generic/__init__.py
diff --git a/test/extractor/generic/test_constant.py b/test/extractor/generic/test_constant.py
new file mode 100644
index 0000000..9dbaced
--- /dev/null
+++ b/test/extractor/generic/test_constant.py
@@ -0,0 +1,124 @@
+"""
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import unittest
+
+# bsie imports
+from bsie.utils import node as _node, ns
+
+# objects to test
+from bsie.extractor.generic.constant import Constant
+
+
+## code ##
+
+# Tests for the Constant extractor, which is constructed from a schema string
+# and a list of (predicate URI, value) tuples.
+class TestConstant(unittest.TestCase):
+ # extract() must yield the configured tuples, restricted to the requested predicates
+ def test_extract(self):
+ schema = '''
+ bse:author rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+ bse:comment rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "false"^^xsd:boolean .
+ '''
+ tuples = [
+ (ns.bse.author, 'Me, myself, and I'),
+ (ns.bse.comment, 'the quick brown fox jumps over the lazy dog.'),
+ ]
+ ext = Constant(schema, tuples)
+ node = _node.Node(ns.bsfs.Entity, '') # Blank node
+ p_author = ext.schema.predicate(ns.bse.author)
+ p_comment = ext.schema.predicate(ns.bse.comment)
+ entity = ext.schema.node(ns.bsfs.Node).get_child(ns.bsfs.Entity)
+ string = ext.schema.literal(ns.bsfs.Literal).get_child(ns.xsd.string)
+ # baseline
+ # (the content argument is None in every call of this test)
+ self.assertSetEqual(set(ext.extract(node, None, (p_author, p_comment))),
+ {(node, p_author, 'Me, myself, and I'),
+ (node, p_comment, 'the quick brown fox jumps over the lazy dog.')})
+ # the predicates argument is respected
+ # p_foobar is not part of the extractor's schema and must not produce a triple
+ p_foobar = ext.schema.predicate(ns.bsfs.Predicate).get_child(ns.bse.foobar, domain=entity, range=entity)
+ self.assertSetEqual(set(ext.extract(node, None, (p_author, p_foobar))),
+ {(node, p_author, 'Me, myself, and I')})
+ self.assertSetEqual(set(ext.extract(node, None, (p_comment, p_foobar))),
+ {(node, p_comment, 'the quick brown fox jumps over the lazy dog.')})
+ # p_barfoo reuses the bse:comment URI but is derived from bse:author instead of the
+ # root predicate; it is expected to be ignored as well
+ p_barfoo = ext.schema.predicate(ns.bse.author).get_child(ns.bse.comment, domain=entity, range=string)
+ self.assertSetEqual(set(ext.extract(node, None, (p_foobar, p_barfoo))), set())
+
+ # the constructor must check the tuples against the schema
+ def test_construct(self):
+ # schema compliance
+ schema = '''
+ bse:author rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+ bse:comment rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "false"^^xsd:boolean .
+ '''
+ # can create an extractor when all tuples refer to predicates of the schema
+ self.assertIsInstance(Constant(schema, [
+ (ns.bse.author, 'Me, myself, and I'),
+ (ns.bse.comment, 'the quick brown fox jumps over the lazy dog.'),
+ ]), Constant)
+ # predicates are validated
+ # (bse:foobar is not declared in the schema, which must raise a KeyError)
+ self.assertRaises(KeyError, Constant, schema, [
+ (ns.bse.author, 'Me, myself, and I'),
+ (ns.bse.foobar, 'foobar!')])
+ # FIXME: values are validated
+ #class Foo(): pass # not string compatible
+ #self.assertRaises(ValueError, Constant, schema, [
+ # (ns.bse.author, Foo())])
+
+ # equality and hash must depend on the configured data, not on object identity
+ def test_eq(self):
+ schema_a = '''
+ bse:author rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+ '''
+ schema_b = '''
+ bse:comment rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "false"^^xsd:boolean .
+ '''
+ tuples_a = [(ns.bse.author, 'Me, myself, and I')]
+ tuples_b = [(ns.bse.comment, 'the quick brown fox jumps over the lazy dog.') ]
+ # distinct instances, same data
+ self.assertEqual(
+ Constant(schema_a, tuples_a),
+ Constant(schema_a, tuples_a))
+ self.assertEqual(
+ hash(Constant(schema_a, tuples_a)),
+ hash(Constant(schema_a, tuples_a)))
+ # different data
+ self.assertNotEqual(
+ Constant(schema_a, tuples_a),
+ Constant(schema_b, tuples_b))
+ self.assertNotEqual(
+ hash(Constant(schema_a, tuples_a)),
+ hash(Constant(schema_b, tuples_b)))
+ # different objects
+ # (comparison with unrelated types must yield "not equal" rather than raise)
+ class Foo(): pass
+ self.assertNotEqual(Constant(schema_a, tuples_a), Foo())
+ self.assertNotEqual(hash(Constant(schema_a, tuples_a)), hash(Foo()))
+ self.assertNotEqual(Constant(schema_a, tuples_a), 123)
+ self.assertNotEqual(hash(Constant(schema_a, tuples_a)), hash(123))
+ self.assertNotEqual(Constant(schema_a, tuples_a), None)
+ self.assertNotEqual(hash(Constant(schema_a, tuples_a)), hash(None))
+
+
+## main ##
+
+if __name__ == '__main__':
+ unittest.main()
+
+## EOF ##
diff --git a/test/extractor/generic/test_path.py b/test/extractor/generic/test_path.py
new file mode 100644
index 0000000..820f402
--- /dev/null
+++ b/test/extractor/generic/test_path.py
@@ -0,0 +1,74 @@
+"""
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import unittest
+
+# bsie imports
+from bsie.base import extractor
+from bsie.utils import bsfs, node as _node, ns
+
+# objects to test
+from bsie.extractor.generic.path import Path
+
+
+## code ##
+
+# Tests for the generic Path extractor (bsie.extractor.generic.path.Path).
+class TestPath(unittest.TestCase):
+ # Path() is constructed without arguments: two instances compare equal,
+ # objects of unrelated types never do
+ def test_eq(self):
+ # distinct instances, same data
+ self.assertEqual(Path(), Path())
+ # different classes
+ class Foo(): pass
+ self.assertNotEqual(Path(), Foo())
+ self.assertNotEqual(Path(), 123)
+ self.assertNotEqual(Path(), None)
+
+ # the extractor's schema is the shared preamble plus the bse:filename predicate
+ def test_schema(self):
+ self.assertEqual(Path().schema,
+ bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ bse:filename rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:File ;
+ rdfs:range xsd:string ;
+ bsfs:unique "false"^^xsd:boolean .
+ '''))
+
+ # extract() takes a path string as content and reports its last component as bse:filename
+ def test_extract(self):
+ ext = Path()
+ node = _node.Node(ns.bsfs.File, '') # Blank node
+ content = '/tmp/foo/bar'
+ p_filename = ext.schema.predicate(ns.bse.filename)
+ entity = ext.schema.node(ns.bsfs.Node).get_child(ns.bsfs.Entity)
+ string = ext.schema.literal(ns.bsfs.Literal).get_child(ns.xsd.string)
+
+ # baseline
+ self.assertSetEqual(set(ext.extract(node, content, (p_filename, ))),
+ {(node, p_filename, 'bar')})
+ # predicates parameter is respected
+ p_foo = ext.schema.predicate(ns.bsfs.Predicate).get_child(ns.bse.foo, domain=entity, range=string) # unsupported predicate
+ self.assertSetEqual(set(ext.extract(node, content, (p_filename, p_foo))),
+ {(node, p_filename, 'bar')})
+ self.assertSetEqual(set(ext.extract(node, content, (p_foo, ))), set())
+ # predicates are validated
+ # (a predicate that merely shares the bse:filename URI must not be served)
+ p_bar = p_foo.get_child(ns.bse.filename) # same URI but different hierarchy
+ self.assertSetEqual(set(ext.extract(node, content, (p_filename, p_bar))),
+ {(node, p_filename, 'bar')})
+ self.assertSetEqual(set(ext.extract(node, content, (p_bar, ))), set())
+ # path variations
+ # (a bare file name is returned as-is, an empty path yields an empty file name)
+ self.assertSetEqual(set(ext.extract(node, 'bar', (p_filename, ))),
+ {(node, p_filename, 'bar')})
+ self.assertSetEqual(set(ext.extract(node, '', (p_filename, ))),
+ {(node, p_filename, '')})
+ # errors are suppressed
+ # (content=None is not a path; the extractor is expected to yield nothing instead of raising)
+ self.assertSetEqual(set(ext.extract(node, None, (p_filename, ))), set())
+
+
+## main ##
+
+if __name__ == '__main__':
+ unittest.main()
+
+## EOF ##
diff --git a/test/extractor/generic/test_stat.py b/test/extractor/generic/test_stat.py
new file mode 100644
index 0000000..3441438
--- /dev/null
+++ b/test/extractor/generic/test_stat.py
@@ -0,0 +1,73 @@
+"""
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import os
+import unittest
+
+# bsie imports
+from bsie.base import extractor
+from bsie.utils import bsfs, node as _node, ns
+
+# objects to test
+from bsie.extractor.generic.stat import Stat
+
+
+## code ##
+
+# Tests for the generic Stat extractor (bsie.extractor.generic.stat.Stat).
+class TestStat(unittest.TestCase):
+ # Stat() is constructed without arguments: two instances compare equal,
+ # objects of unrelated types never do
+ def test_eq(self):
+ # distinct instances, same data
+ self.assertEqual(Stat(), Stat())
+ # different classes
+ class Foo(): pass
+ self.assertNotEqual(Stat(), Foo())
+ self.assertNotEqual(Stat(), 123)
+ self.assertNotEqual(Stat(), None)
+
+ # the extractor's schema is the shared preamble plus the bse:filesize predicate
+ def test_schema(self):
+ self.assertEqual(Stat().schema,
+ bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ bse:filesize rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:File ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "false"^^xsd:boolean .
+ '''))
+
+ # extract() takes an os.stat_result as content and reports its st_size as bse:filesize
+ def test_extract(self):
+ ext = Stat()
+ node = _node.Node(ns.bsfs.File, '') # Blank node
+ content = os.stat(__file__)
+ p_filesize = ext.schema.predicate(ns.bse.filesize)
+ entity = ext.schema.node(ns.bsfs.Node).get_child(ns.bsfs.Entity)
+ string = ext.schema.literal(ns.bsfs.Literal).get_child(ns.xsd.string)
+
+ # baseline
+ self.assertSetEqual(set(ext.extract(node, content, (p_filesize, ))),
+ {(node, p_filesize, content.st_size)})
+ # predicates parameter is respected
+ p_foo = ext.schema.predicate(ns.bsfs.Predicate).get_child(ns.bse.foo, domain=entity, range=string) # unsupported predicate
+ self.assertSetEqual(set(ext.extract(node, content, (p_filesize, p_foo))),
+ {(node, p_filesize, content.st_size)})
+ self.assertSetEqual(set(ext.extract(node, content, (p_foo, ))), set())
+ # predicates are validated
+ # (the URI must be spelled exactly like the supported one, otherwise this case
+ # degenerates into a second "unsupported predicate" check)
+ p_bar = p_foo.get_child(ns.bse.filesize) # same URI but different hierarchy
+ self.assertSetEqual(set(ext.extract(node, content, (p_filesize, p_bar))),
+ {(node, p_filesize, content.st_size)})
+ self.assertSetEqual(set(ext.extract(node, content, (p_bar, ))), set())
+ # content variations
+ # (a synthetic stat result with every field set to 12345, hence st_size == 12345)
+ self.assertSetEqual(set(ext.extract(node, os.stat_result([12345] * len(content)), (p_filesize, p_bar))),
+ {(node, p_filesize, 12345)})
+ # errors are suppressed
+ # (content=None is not a stat result; the extractor is expected to yield nothing instead of raising)
+ self.assertSetEqual(set(ext.extract(node, None, (p_filesize, ))), set())
+
+
+## main ##
+
+if __name__ == '__main__':
+ unittest.main()
+
+## EOF ##
diff --git a/test/lib/__init__.py b/test/lib/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/lib/__init__.py
diff --git a/test/lib/test_bsie.py b/test/lib/test_bsie.py
new file mode 100644
index 0000000..771a0c2
--- /dev/null
+++ b/test/lib/test_bsie.py
@@ -0,0 +1,179 @@
+"""
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import os
+import unittest
+
+# bsie imports
+from bsie.base import extractor
+from bsie.tools import builder
+from bsie.utils import bsfs, node, ns
+
+# objects to test
+from bsie.lib.bsie import BSIE
+
+
+## code ##
+
+# Tests for the BSIE front-end class, which wraps an extraction pipeline and
+# optionally restricts the predicates it exposes via collect/discard.
+class TestBSIE(unittest.TestCase):
+ # builds the pipeline shared by all tests: Path, Stat and a Constant extractor
+ # that contributes a fixed bse:author value
+ def setUp(self):
+ # reader builder
+ # (no reader-specific configuration is passed)
+ rbuild = builder.ReaderBuilder({})
+ # extractor builder
+ ebuild = builder.ExtractorBuilder([
+ {'bsie.extractor.generic.path.Path': {}},
+ {'bsie.extractor.generic.stat.Stat': {}},
+ {'bsie.extractor.generic.constant.Constant': dict(
+ tuples=[('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')],
+ schema='''
+ bse:author rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+ ''',
+ )},
+ ])
+ # build pipeline
+ self.prefix = bsfs.Namespace('http://example.com/local/')
+ pbuild = builder.PipelineBuilder(self.prefix, rbuild, ebuild)
+ self.pipeline = pbuild.build()
+
+ # principals and schema must reflect the collect/discard arguments
+ def test_construction(self):
+ # pipeline only
+ # (without restrictions, the predicates of all three extractors are exposed)
+ lib = BSIE(self.pipeline)
+ self.assertSetEqual(set(lib.principals), {
+ ns.bse.filename,
+ ns.bse.filesize,
+ ns.bse.author,
+ })
+ self.assertEqual(lib.schema, bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ bse:filename rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:File ;
+ rdfs:range xsd:string ;
+ bsfs:unique "false"^^xsd:boolean .
+
+ bse:filesize rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:File ;
+ rdfs:range xsd:integer;
+ bsfs:unique "false"^^xsd:boolean .
+
+ bse:author rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+ '''))
+
+ # specify collect
+ # (bse:inexistent is not provided by the pipeline and is expected to be ignored)
+ lib = BSIE(self.pipeline, collect={
+ ns.bse.filesize,
+ ns.bse.author,
+ ns.bse.inexistent,
+ })
+ self.assertSetEqual(set(lib.principals), {
+ ns.bse.filesize,
+ ns.bse.author,
+ })
+ self.assertEqual(lib.schema, bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ bse:filesize rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:File ;
+ rdfs:range xsd:integer;
+ bsfs:unique "false"^^xsd:boolean .
+
+ bse:author rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+ '''))
+ # empty collect is disregarded
+ # (note: {} is an empty dict literal, not an empty set)
+ lib = BSIE(self.pipeline, collect={})
+ self.assertSetEqual(set(lib.principals), {
+ ns.bse.filename,
+ ns.bse.filesize,
+ ns.bse.author,
+ })
+ self.assertEqual(lib.schema, bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ bse:filename rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:File ;
+ rdfs:range xsd:string ;
+ bsfs:unique "false"^^xsd:boolean .
+
+ bse:filesize rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:File ;
+ rdfs:range xsd:integer;
+ bsfs:unique "false"^^xsd:boolean .
+
+ bse:author rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+
+ '''))
+
+ # specify discard
+ # (discarding a predicate the pipeline does not provide is expected to be harmless)
+ lib = BSIE(self.pipeline, discard={
+ ns.bse.filesize,
+ ns.bse.filename,
+ ns.bse.inexistent,
+ })
+ self.assertSetEqual(set(lib.principals), {
+ ns.bse.author,
+ })
+ self.assertEqual(lib.schema, bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ bse:author rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+ '''))
+
+ # specify collect and discard
+ # (bse:author is listed in both and is expected to be dropped, i.e. discard wins)
+ lib = BSIE(self.pipeline,
+ collect={ns.bse.filesize, ns.bse.author, ns.bse.foo, ns.bse.bar},
+ discard={ns.bse.author, ns.bse.foo, ns.bse.foobar},
+ )
+ self.assertSetEqual(set(lib.principals), {
+ ns.bse.filesize,
+ })
+ self.assertEqual(lib.schema, bsfs.schema.Schema.from_string(extractor.SCHEMA_PREAMBLE + '''
+ bse:filesize rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:File ;
+ rdfs:range xsd:integer;
+ bsfs:unique "false"^^xsd:boolean .
+
+ '''))
+
+
+ # from_file() must yield the triples of a concrete file, optionally limited to some predicates
+ def test_from_file(self):
+ # setup
+ lib = BSIE(self.pipeline)
+ self.assertSetEqual(set(lib.principals), {
+ ns.bse.filesize,
+ ns.bse.filename,
+ ns.bse.author,
+ })
+ # NOTE(review): presumably the SHA-256 digest of the content of testfile.t -- confirm
+ content_hash = 'a948904f2f0f479b8f8197694b30184b0d2ed1c1cd2a1ec0fb85d299a192a447'
+ # expected subject: a File node whose URI is built from the prefix, 'file#' and the hash
+ subject = node.Node(ns.bsfs.File, (self.prefix + 'file#')[content_hash])
+ testfile = os.path.join(os.path.dirname(__file__), 'testfile.t')
+
+ # from_file extracts all available triples
+ # (the expected file size of 12 is the byte length of testfile.t)
+ self.assertSetEqual(set(lib.from_file(testfile)), {
+ (subject, lib.schema.predicate(ns.bse.filename), 'testfile.t'),
+ (subject, lib.schema.predicate(ns.bse.filesize), 12),
+ (subject, lib.schema.predicate(ns.bse.author), 'Me, myself, and I'),
+ })
+
+ # from_file respects predicate argument
+ # (bse:invalid is not provided by the pipeline and is expected to be ignored)
+ self.assertSetEqual(set(lib.from_file(testfile, {ns.bse.filename, ns.bse.invalid})), {
+ (subject, lib.schema.predicate(ns.bse.filename), 'testfile.t'),
+ })
+
+
+## main ##
+
+if __name__ == '__main__':
+ unittest.main()
+
+## EOF ##
diff --git a/test/lib/testfile.t b/test/lib/testfile.t
new file mode 100644
index 0000000..3b18e51
--- /dev/null
+++ b/test/lib/testfile.t
@@ -0,0 +1 @@
+hello world
diff --git a/test/reader/__init__.py b/test/reader/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/reader/__init__.py
diff --git a/test/reader/test_path.py b/test/reader/test_path.py
new file mode 100644
index 0000000..fd7bc5a
--- /dev/null
+++ b/test/reader/test_path.py
@@ -0,0 +1,28 @@
+"""
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import unittest
+
+# objects to test
+from bsie.reader.path import Path
+
+
+## code ##
+
+# Tests for the Path reader (bsie.reader.path.Path).
+class TestPath(unittest.TestCase):
+ # calling the reader must return the given path string unchanged,
+ # including the empty string and paths that contain spaces
+ def test_call(self):
+ self.assertEqual('', Path()(''))
+ self.assertEqual('/tmp/foo/bar', Path()('/tmp/foo/bar'))
+ self.assertEqual('/home/myself/some file', Path()('/home/myself/some file'))
+
+
+## main ##
+
+if __name__ == '__main__':
+ unittest.main()
+
+## EOF ##
diff --git a/test/reader/test_stat.py b/test/reader/test_stat.py
new file mode 100644
index 0000000..d12ad9c
--- /dev/null
+++ b/test/reader/test_stat.py
@@ -0,0 +1,34 @@
+"""
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import os
+import unittest
+
+# bsie imports
+from bsie.base import errors
+
+# objects to test
+from bsie.reader.stat import Stat
+
+
+## code ##
+
+# Tests for the Stat reader (bsie.reader.stat.Stat).
+# NOTE(review): the class is called TestPath although it exercises the Stat reader;
+# looks like a leftover from test_path.py -- confirm before renaming.
+class TestPath(unittest.TestCase):
+ def test_call(self):
+ # test self
+ # (the reader's result for this very file must equal a direct os.stat call)
+ self.assertEqual(os.stat(__file__), Stat()(__file__))
+ # test invalid file
+ # (an empty path and None must both be reported as ReaderError)
+ self.assertRaises(errors.ReaderError, Stat(), '')
+ self.assertRaises(errors.ReaderError, Stat(), None)
+
+
+## main ##
+
+if __name__ == '__main__':
+ unittest.main()
+
+## EOF ##
diff --git a/test/tools/__init__.py b/test/tools/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/tools/__init__.py
diff --git a/test/tools/test_builder.py b/test/tools/test_builder.py
new file mode 100644
index 0000000..62c637c
--- /dev/null
+++ b/test/tools/test_builder.py
@@ -0,0 +1,246 @@
+"""
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import logging
+import unittest
+
+# bsie imports
+from bsie import base
+from bsie.utils import bsfs
+
+# objects to test
+from bsie.tools.builder import ExtractorBuilder
+from bsie.tools.builder import PipelineBuilder
+from bsie.tools.builder import ReaderBuilder
+from bsie.tools.builder import _safe_load
+from bsie.tools.builder import _unpack_name
+
+
+## code ##
+
+class TestUtils(unittest.TestCase):
+ def test_safe_load(self):
+ # invalid module
+ self.assertRaises(base.errors.LoaderError, _safe_load, 'dBGHMSAYOoKeKMpywDoKZQycENFPvN', 'foobar')
+ self.assertRaises(base.errors.LoaderError, _safe_load, 'dBGHMSAYOoKeKMpywDoKZQycENFPvN.bar', 'foobar')
+ # partially valid module
+ self.assertRaises(base.errors.LoaderError, _safe_load, 'os.foo', 'foobar')
+ # invalid class
+ self.assertRaises(base.errors.LoaderError, _safe_load, 'os.path', 'foo')
+ # valid module and class
+ cls = _safe_load('collections.abc', 'Container')
+ import collections.abc
+ self.assertEqual(cls, collections.abc.Container)
+
+ def test_unpack_name(self):
+ self.assertRaises(TypeError, _unpack_name, 123)
+ self.assertRaises(TypeError, _unpack_name, None)
+ self.assertRaises(ValueError, _unpack_name, '')
+ self.assertRaises(ValueError, _unpack_name, 'path')
+ self.assertRaises(ValueError, _unpack_name, '.Path')
+ self.assertEqual(_unpack_name('path.Path'), ('path', 'Path'))
+ self.assertEqual(_unpack_name('path.foo.bar.Path'), ('path.foo.bar', 'Path'))
+
+
+class TestReaderBuilder(unittest.TestCase):
+ def test_build(self):
+ builder = ReaderBuilder({'bsie.reader.path.Path': {}})
+ # build configured reader
+ cls = builder.build('bsie.reader.path.Path')
+ import bsie.reader.path
+ self.assertIsInstance(cls, bsie.reader.path.Path)
+ # build unconfigured reader
+ cls = builder.build('bsie.reader.stat.Stat')
+ import bsie.reader.stat
+ self.assertIsInstance(cls, bsie.reader.stat.Stat)
+ # re-build previous reader (test cache)
+ self.assertEqual(cls, builder.build('bsie.reader.stat.Stat'))
+ # test invalid
+ self.assertRaises(TypeError, builder.build, 123)
+ self.assertRaises(TypeError, builder.build, None)
+ self.assertRaises(ValueError, builder.build, '')
+ self.assertRaises(ValueError, builder.build, 'Path')
+ self.assertRaises(base.errors.BuilderError, builder.build, 'path.Path')
+ # invalid config
+ builder = ReaderBuilder({'bsie.reader.stat.Stat': dict(foo=123)})
+ self.assertRaises(base.errors.BuilderError, builder.build, 'bsie.reader.stat.Stat')
+ builder = ReaderBuilder({'bsie.reader.stat.Stat': 123})
+ self.assertRaises(TypeError, builder.build, 'bsie.reader.stat.Stat')
+ # no instructions
+ builder = ReaderBuilder({})
+ cls = builder.build('bsie.reader.stat.Stat')
+ self.assertIsInstance(cls, bsie.reader.stat.Stat)
+
+
+
+class TestExtractorBuilder(unittest.TestCase):
+ def test_iter(self):
+ # no specifications
+ self.assertListEqual(list(ExtractorBuilder([])), [])
+ # some specifications
+ builder = ExtractorBuilder([
+ {'bsie.extractor.generic.path.Path': {}},
+ {'bsie.extractor.generic.stat.Stat': {}},
+ {'bsie.extractor.generic.path.Path': {}},
+ ])
+ self.assertListEqual(list(builder), [0, 1, 2])
+
+ def test_build(self):
+ # simple and repeated extractors
+ builder = ExtractorBuilder([
+ {'bsie.extractor.generic.path.Path': {}},
+ {'bsie.extractor.generic.stat.Stat': {}},
+ {'bsie.extractor.generic.path.Path': {}},
+ ])
+ ext = [builder.build(0), builder.build(1), builder.build(2)]
+ import bsie.extractor.generic.path
+ import bsie.extractor.generic.stat
+ self.assertListEqual(ext, [
+ bsie.extractor.generic.path.Path(),
+ bsie.extractor.generic.stat.Stat(),
+ bsie.extractor.generic.path.Path(),
+ ])
+ # out-of-bounds raises IndexError
+ self.assertRaises(IndexError, builder.build, 3)
+
+ # building with args
+ builder = ExtractorBuilder([
+ {'bsie.extractor.generic.constant.Constant': {
+ 'schema': '''
+ bse:author rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+ bse:rating rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "true"^^xsd:boolean .
+ ''',
+ 'tuples': [
+ ('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I'),
+ ('http://bsfs.ai/schema/Entity#rating', 123),
+ ],
+ }}])
+ obj = builder.build(0)
+ import bsie.extractor.generic.constant
+ self.assertEqual(obj, bsie.extractor.generic.constant.Constant('''
+ bse:author rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+ bse:rating rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "true"^^xsd:boolean .
+ ''', [
+ ('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I'),
+ ('http://bsfs.ai/schema/Entity#rating', 123),
+ ]))
+
+ # building with invalid args
+ self.assertRaises(base.errors.BuilderError, ExtractorBuilder(
+ [{'bsie.extractor.generic.path.Path': {'foo': 123}}]).build, 0)
+ # non-dict build specification
+ self.assertRaises(TypeError, ExtractorBuilder(
+ [('bsie.extractor.generic.path.Path', {})]).build, 0)
+ # multiple keys per build specification
+ self.assertRaises(TypeError, ExtractorBuilder(
+ [{'bsie.extractor.generic.path.Path': {},
+ 'bsie.extractor.generic.stat.Stat': {}}]).build, 0)
+ # non-dict value for kwargs
+ self.assertRaises(TypeError, ExtractorBuilder(
+ [{'bsie.extractor.generic.path.Path': 123}]).build, 0)
+
+
+
+
+class TestPipelineBuilder(unittest.TestCase):
+ def test_build(self):
+ prefix = bsfs.URI('http://example.com/local/file#')
+ c_schema = '''
+ bse:author rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+ '''
+ c_tuples = [('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')]
+ # prepare builders
+ rbuild = ReaderBuilder({})
+ ebuild = ExtractorBuilder([
+ {'bsie.extractor.generic.path.Path': {}},
+ {'bsie.extractor.generic.stat.Stat': {}},
+ {'bsie.extractor.generic.constant.Constant': dict(
+ schema=c_schema,
+ tuples=c_tuples,
+ )},
+ ])
+ # build pipeline
+ builder = PipelineBuilder(prefix, rbuild, ebuild)
+ pipeline = builder.build()
+ # delayed import
+ import bsie.reader.path
+ import bsie.reader.stat
+ import bsie.extractor.generic.path
+ import bsie.extractor.generic.stat
+ import bsie.extractor.generic.constant
+ # check pipeline
+ self.assertDictEqual(pipeline._ext2rdr, {
+ bsie.extractor.generic.path.Path(): bsie.reader.path.Path(),
+ bsie.extractor.generic.stat.Stat(): bsie.reader.stat.Stat(),
+ bsie.extractor.generic.constant.Constant(c_schema, c_tuples): None,
+ })
+
+ # fail to load extractor
+ ebuild_err = ExtractorBuilder([
+ {'bsie.extractor.generic.foo.Foo': {}},
+ {'bsie.extractor.generic.path.Path': {}},
+ ])
+ with self.assertLogs(logging.getLogger('bsie.tools.builder'), logging.ERROR):
+ pipeline = PipelineBuilder(prefix, rbuild, ebuild_err).build()
+ self.assertDictEqual(pipeline._ext2rdr, {
+ bsie.extractor.generic.path.Path(): bsie.reader.path.Path()})
+
+ # fail to build extractor
+ ebuild_err = ExtractorBuilder([
+ {'bsie.extractor.generic.path.Path': {'foo': 123}},
+ {'bsie.extractor.generic.path.Path': {}},
+ ])
+ with self.assertLogs(logging.getLogger('bsie.tools.builder'), logging.ERROR):
+ pipeline = PipelineBuilder(prefix, rbuild, ebuild_err).build()
+ self.assertDictEqual(pipeline._ext2rdr, {
+ bsie.extractor.generic.path.Path(): bsie.reader.path.Path()})
+
+ # fail to load reader
+ with self.assertLogs(logging.getLogger('bsie.tools.builder'), logging.ERROR):
+ # switch reader of an extractor
+ old_reader = bsie.extractor.generic.path.Path.CONTENT_READER
+ bsie.extractor.generic.path.Path.CONTENT_READER = 'bsie.reader.foo.Foo'
+ # build pipeline with invalid reader reference
+ pipeline = PipelineBuilder(prefix, rbuild, ebuild).build()
+ self.assertDictEqual(pipeline._ext2rdr, {
+ bsie.extractor.generic.stat.Stat(): bsie.reader.stat.Stat(),
+ bsie.extractor.generic.constant.Constant(c_schema, c_tuples): None,
+ })
+ # switch back
+ bsie.extractor.generic.path.Path.CONTENT_READER = old_reader
+
+ # fail to build reader
+ rbuild_err = ReaderBuilder({'bsie.reader.stat.Stat': dict(foo=123)})
+ with self.assertLogs(logging.getLogger('bsie.tools.builder'), logging.ERROR):
+ pipeline = PipelineBuilder(prefix, rbuild_err, ebuild).build()
+ self.assertDictEqual(pipeline._ext2rdr, {
+ bsie.extractor.generic.path.Path(): bsie.reader.path.Path(),
+ bsie.extractor.generic.constant.Constant(c_schema, c_tuples): None,
+ })
+
+
+## main ##
+
+if __name__ == '__main__':
+ unittest.main()
+
+## EOF ##
diff --git a/test/tools/test_pipeline.py b/test/tools/test_pipeline.py
new file mode 100644
index 0000000..a116a30
--- /dev/null
+++ b/test/tools/test_pipeline.py
@@ -0,0 +1,176 @@
+"""
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import logging
+import os
+import unittest
+
+# bsie imports
+from bsie.base import errors
+from bsie.utils import bsfs, node, ns
+import bsie.extractor.generic.constant
+import bsie.extractor.generic.path
+import bsie.extractor.generic.stat
+import bsie.reader.path
+import bsie.reader.stat
+
+# objects to test
+from bsie.tools.pipeline import Pipeline
+
+
+## code ##
+
+class TestPipeline(unittest.TestCase):
+ def setUp(self):
+ # constant A
+ csA = '''
+ bse:author rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:File ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+ '''
+ tupA = [('http://bsfs.ai/schema/Entity#author', 'Me, myself, and I')]
+ # constant B
+ csB = '''
+ bse:rating rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsfs:File ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "true"^^xsd:boolean .
+ '''
+ tupB = [('http://bsfs.ai/schema/Entity#rating', 123)]
+ # extractors/readers
+ self.ext2rdr = {
+ bsie.extractor.generic.path.Path(): bsie.reader.path.Path(),
+ bsie.extractor.generic.stat.Stat(): bsie.reader.stat.Stat(),
+ bsie.extractor.generic.constant.Constant(csA, tupA): None,
+ bsie.extractor.generic.constant.Constant(csB, tupB): None,
+ }
+ self.prefix = bsfs.Namespace('http://example.com/local/')
+
+ def test_essentials(self):
+ pipeline = Pipeline(self.prefix, self.ext2rdr)
+ self.assertEqual(str(pipeline), 'Pipeline')
+ self.assertEqual(repr(pipeline), 'Pipeline(...)')
+
+ def test_equality(self):
+ pipeline = Pipeline(self.prefix, self.ext2rdr)
+ # a pipeline is equivalent to itself
+ self.assertEqual(pipeline, pipeline)
+ self.assertEqual(hash(pipeline), hash(pipeline))
+ # identical builds are equivalent
+ self.assertEqual(pipeline, Pipeline(self.prefix, self.ext2rdr))
+ self.assertEqual(hash(pipeline), hash(Pipeline(self.prefix, self.ext2rdr)))
+
+ # equivalence respects prefix
+ self.assertNotEqual(pipeline, Pipeline(bsfs.URI('http://example.com/global/ent#'), self.ext2rdr))
+ self.assertNotEqual(hash(pipeline), hash(Pipeline(bsfs.URI('http://example.com/global/ent#'), self.ext2rdr)))
+ # equivalence respects extractors/readers
+ ext2rdr = {ext: rdr for idx, (ext, rdr) in enumerate(self.ext2rdr.items()) if idx % 2 == 0}
+ self.assertNotEqual(pipeline, Pipeline(self.prefix, ext2rdr))
+ self.assertNotEqual(hash(pipeline), hash(Pipeline(self.prefix, ext2rdr)))
+
+ # equivalence respects schema
+ p2 = Pipeline(self.prefix, self.ext2rdr)
+ p2._schema = pipeline.schema.Empty()
+ self.assertNotEqual(pipeline, p2)
+ self.assertNotEqual(hash(pipeline), hash(p2))
+
+ # not equal to other types
+ class Foo(): pass
+ self.assertNotEqual(pipeline, Foo())
+ self.assertNotEqual(hash(pipeline), hash(Foo()))
+ self.assertNotEqual(pipeline, 123)
+ self.assertNotEqual(hash(pipeline), hash(123))
+ self.assertNotEqual(pipeline, None)
+ self.assertNotEqual(hash(pipeline), hash(None))
+
+
+ def test_call(self):
+ # build pipeline
+ pipeline = Pipeline(self.prefix, self.ext2rdr)
+ # build objects for tests
+ content_hash = 'a948904f2f0f479b8f8197694b30184b0d2ed1c1cd2a1ec0fb85d299a192a447'
+ subject = node.Node(ns.bsfs.File, (self.prefix + 'file#')[content_hash])
+ testfile = os.path.join(os.path.dirname(__file__), 'testfile.t')
+ p_filename = pipeline.schema.predicate(ns.bse.filename)
+ p_filesize = pipeline.schema.predicate(ns.bse.filesize)
+ p_author = pipeline.schema.predicate(ns.bse.author)
+ p_rating = pipeline.schema.predicate(ns.bse.rating)
+ entity = pipeline.schema.node(ns.bsfs.File)
+ p_invalid = pipeline.schema.predicate(ns.bsfs.Predicate).get_child(ns.bse.foo, range=entity)
+
+ # extract given predicates
+ self.assertSetEqual(set(pipeline(testfile, {p_filename, p_filesize})), {
+ (subject, p_filename, 'testfile.t'),
+ (subject, p_filesize, 12),
+ })
+ self.assertSetEqual(set(pipeline(testfile, {p_author})), {
+ (subject, p_author, 'Me, myself, and I'),
+ })
+ self.assertSetEqual(set(pipeline(testfile, {p_filename})), {
+ (subject, p_filename, 'testfile.t'),
+ })
+ self.assertSetEqual(set(pipeline(testfile, {p_filesize})), {
+ (subject, p_filesize, 12),
+ })
+ # extract all predicates
+ self.assertSetEqual(set(pipeline(testfile)), {
+ (subject, p_filename, 'testfile.t'),
+ (subject, p_filesize, 12),
+ (subject, p_author, 'Me, myself, and I'),
+ (subject, p_rating, 123),
+ })
+ # invalid predicate
+ self.assertSetEqual(set(pipeline(testfile, {p_invalid})), set())
+ # valid/invalid predicates mixed
+ self.assertSetEqual(set(pipeline(testfile, {p_filename, p_invalid})), {
+ (subject, p_filename, 'testfile.t'),
+ })
+ # invalid path
+ self.assertRaises(FileNotFoundError, list, pipeline('inexistent_file'))
+ # FIXME: add a test for an unreadable file (e.g. permission error)
+
+ def test_call_reader_err(self):
+ class FaultyReader(bsie.reader.path.Path):
+ def __call__(self, path):
+ raise errors.ReaderError('reader error')
+
+ pipeline = Pipeline(self.prefix, {bsie.extractor.generic.path.Path(): FaultyReader()})
+ with self.assertLogs(logging.getLogger('bsie.tools.pipeline'), logging.ERROR):
+ testfile = os.path.join(os.path.dirname(__file__), 'testfile.t')
+ p_filename = pipeline.schema.predicate(ns.bse.filename)
+ self.assertSetEqual(set(pipeline(testfile, {p_filename})), set())
+
+ def test_call_extractor_err(self):
+ class FaultyExtractor(bsie.extractor.generic.path.Path):
+ def extract(self, subject, content, predicates):
+ raise errors.ExtractorError('extractor error')
+
+ pipeline = Pipeline(self.prefix, {FaultyExtractor(): bsie.reader.path.Path()})
+ with self.assertLogs(logging.getLogger('bsie.tools.pipeline'), logging.ERROR):
+ testfile = os.path.join(os.path.dirname(__file__), 'testfile.t')
+ p_filename = pipeline.schema.predicate(ns.bse.filename)
+ self.assertSetEqual(set(pipeline(testfile, {p_filename})), set())
+
+ def test_predicates(self):
+ # build pipeline
+ pipeline = Pipeline(self.prefix, self.ext2rdr)
+ # principals are the predicates of all extractors in the pipeline
+ self.assertSetEqual(set(pipeline.principals), {
+ pipeline.schema.predicate(ns.bse.filename),
+ pipeline.schema.predicate(ns.bse.filesize),
+ pipeline.schema.predicate(ns.bse.author),
+ pipeline.schema.predicate(ns.bse.rating),
+ })
+
+
+## main ##
+
+if __name__ == '__main__':
+ unittest.main()
+
+## EOF ##
diff --git a/test/tools/testfile.t b/test/tools/testfile.t
new file mode 100644
index 0000000..3b18e51
--- /dev/null
+++ b/test/tools/testfile.t
@@ -0,0 +1 @@
+hello world
diff --git a/test/utils/__init__.py b/test/utils/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/utils/__init__.py
diff --git a/test/utils/test_node.py b/test/utils/test_node.py
new file mode 100644
index 0000000..c70f0b8
--- /dev/null
+++ b/test/utils/test_node.py
@@ -0,0 +1,65 @@
+"""
+
+Part of the bsie test suite.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# imports
+import unittest
+
+# bsie imports
+from bsie.utils import bsfs, ns
+
+# objects to test
+from bsie.utils.node import Node
+
+
+## code ##
+
+class TestNode(unittest.TestCase):
+ def test_equality(self):
+ uri = bsfs.URI('http://example.com/me/entity#1234')
+ node = Node(ns.bsfs.Entity, uri)
+ # basic equivalence
+ self.assertEqual(node, Node(ns.bsfs.Entity, bsfs.URI('http://example.com/me/entity#1234')))
+ self.assertEqual(hash(node), hash(Node(ns.bsfs.Entity, bsfs.URI('http://example.com/me/entity#1234'))))
+ # equality respects uri
+ self.assertNotEqual(node, Node(ns.bsfs.Entity, bsfs.URI('http://example.com/me/entity#4321')))
+ self.assertNotEqual(hash(node), hash(Node(ns.bsfs.Entity, bsfs.URI('http://example.com/me/entity#4321'))))
+ # equality respects node_type
+ self.assertNotEqual(node, Node(ns.bsfs.Foo, uri))
+ self.assertNotEqual(hash(node), hash(Node(ns.bsfs.Foo, uri)))
+ # not equal to other types
+ self.assertNotEqual(node, 1234)
+ self.assertNotEqual(hash(node), hash(1234))
+ self.assertNotEqual(node, uri)
+ self.assertNotEqual(hash(node), hash(uri))
+ self.assertNotEqual(node, ns.bsfs.Entity)
+ self.assertNotEqual(hash(node), hash(ns.bsfs.Entity))
+ class Foo(): pass
+ self.assertNotEqual(node, Foo())
+ self.assertNotEqual(hash(node), hash(Foo()))
+
+ def test_str(self):
+ uri = bsfs.URI('http://example.com/me/entity#1234')
+ # basic string conversion
+ node = Node(ns.bsfs.Entity, uri)
+ self.assertEqual(str(node), 'Node(http://bsfs.ai/schema/Entity, http://example.com/me/entity#1234)')
+ self.assertEqual(repr(node), 'Node(http://bsfs.ai/schema/Entity, http://example.com/me/entity#1234)')
+ # string conversion respects node_type
+ node = Node(ns.bsfs.Foo, uri)
+ self.assertEqual(str(node), 'Node(http://bsfs.ai/schema/Foo, http://example.com/me/entity#1234)')
+ self.assertEqual(repr(node), 'Node(http://bsfs.ai/schema/Foo, http://example.com/me/entity#1234)')
+ # string conversion respects uri
+ node = Node(ns.bsfs.Entity, bsfs.URI('http://example.com/me/entity#4321'))
+ self.assertEqual(str(node), 'Node(http://bsfs.ai/schema/Entity, http://example.com/me/entity#4321)')
+ self.assertEqual(repr(node), 'Node(http://bsfs.ai/schema/Entity, http://example.com/me/entity#4321)')
+
+
+
+## main ##
+
+if __name__ == '__main__':
+ unittest.main()
+
+## EOF ##