diff options
author | Matthias Baumgartner <dev@igsor.net> | 2022-12-18 14:22:31 +0100 |
---|---|---|
committer | Matthias Baumgartner <dev@igsor.net> | 2022-12-18 14:22:31 +0100 |
commit | 7582c280ad5324a2f0427999911c7e7abc14a6ab (patch) | |
tree | 0a59bbfe1c44d3497daad9f25ff9e7eb2bf9eb82 /bsie/base | |
parent | cb49e4567a18de6851286ff672e54f9a91865fe9 (diff) | |
parent | 057e09d6537bf5c39815661a75819081e3e5fda7 (diff) | |
download | bsie-7582c280ad5324a2f0427999911c7e7abc14a6ab.tar.gz bsie-7582c280ad5324a2f0427999911c7e7abc14a6ab.tar.bz2 bsie-7582c280ad5324a2f0427999911c7e7abc14a6ab.zip |
Merge branch 'develop' into main
Diffstat (limited to 'bsie/base')
-rw-r--r-- | bsie/base/__init__.py | 24 | ||||
-rw-r--r-- | bsie/base/errors.py | 42 | ||||
-rw-r--r-- | bsie/base/extractor.py | 103 | ||||
-rw-r--r-- | bsie/base/reader.py | 47 |
4 files changed, 216 insertions, 0 deletions
diff --git a/bsie/base/__init__.py b/bsie/base/__init__.py new file mode 100644 index 0000000..0d362cd --- /dev/null +++ b/bsie/base/__init__.py @@ -0,0 +1,24 @@ +"""The base module defines the BSIE interfaces. + +You'll mostly find abstract classes here. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# inner-module imports +from . import errors +from .extractor import Extractor +from .reader import Reader + +# exports +__all__: typing.Sequence[str] = ( + 'Extractor', + 'Reader', + 'errors', + ) + +## EOF ## diff --git a/bsie/base/errors.py b/bsie/base/errors.py new file mode 100644 index 0000000..dc3c30e --- /dev/null +++ b/bsie/base/errors.py @@ -0,0 +1,42 @@ +"""Common BSIE exceptions. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import typing + +# exports +__all__: typing.Sequence[str] = ( + 'BuilderError', + 'ExtractorError', + 'LoaderError', + 'ReaderError', + ) + + +## code ## + +class _BSIEError(Exception): + """Generic BSIE error.""" + +class BuilderError(_BSIEError): + """The Builder failed to create an instance.""" + +class LoaderError(BuilderError): + """Failed to load a module or class.""" + +class ExtractorError(_BSIEError): + """The Extractor failed to process the given content.""" + +class ReaderError(_BSIEError): + """The Reader failed to read the given file.""" + +class ProgrammingError(_BSIEError): + """An assertion-like error that indicates a code-base issue.""" + +class UnreachableError(ProgrammingError): + """Bravo, you've reached a point in code that should logically not be reachable.""" + +## EOF ## diff --git a/bsie/base/extractor.py b/bsie/base/extractor.py new file mode 100644 index 0000000..c44021b --- /dev/null +++ b/bsie/base/extractor.py @@ -0,0 +1,103 @@ +"""The Extractor classes transform content into triples. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import abc +import typing + +# bsie imports +from bsie.utils import bsfs, node, ns + +# exports +__all__: typing.Sequence[str] = ( + 'Extractor', + ) + +# constants + +# essential definitions typically used in extractor schemas. +# NOTE: This preamble is only for convenience; Each Extractor must implement its use, if so desired. +SCHEMA_PREAMBLE = ''' + # common external prefixes + prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> + prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> + prefix xsd: <http://www.w3.org/2001/XMLSchema#> + prefix schema: <http://schema.org/> + + # common bsfs prefixes + prefix bsfs: <http://bsfs.ai/schema/> + prefix bse: <http://bsfs.ai/schema/Entity#> + + # essential nodes + bsfs:Entity rdfs:subClassOf bsfs:Node . + bsfs:File rdfs:subClassOf bsfs:Entity . + + # common definitions + xsd:string rdfs:subClassOf bsfs:Literal . + xsd:integer rdfs:subClassOf bsfs:Literal . + + ''' + + +## code ## + +class Extractor(abc.ABC): + """Produce (subject, predicate, value)-triples from some content. + The Extractor produces princpal predicates that provide information + about the content itself (i.e., triples that include the subject), + and may also generate triples with auxiliary predicates if the + extracted value is a node itself. + """ + + # what type of content is expected (i.e. reader subclass). + CONTENT_READER: typing.Optional[str] = None + + # extractor schema. + _schema: bsfs.schema.Schema + + def __init__(self, schema: bsfs.schema.Schema): + self._schema = schema + + def __str__(self) -> str: + return bsfs.typename(self) + + def __repr__(self) -> str: + return f'{bsfs.typename(self)}()' + + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) \ + and self.CONTENT_READER == other.CONTENT_READER \ + and self.schema == other.schema + + def __hash__(self) -> int: + return hash((type(self), self.CONTENT_READER, self.schema)) + + @property + def schema(self) -> bsfs.schema.Schema: + """Return the extractor's schema.""" + return self._schema + + @property + def principals(self) -> typing.Iterator[bsfs.schema.Predicate]: + """Return the principal predicates, i.e., relations from/to the extraction subject.""" + ent = self.schema.node(ns.bsfs.Entity) + return ( + pred + for pred + in self.schema.predicates() + if pred.domain <= ent or (pred.range is not None and pred.range <= ent) + ) + + @abc.abstractmethod + def extract( + self, + subject: node.Node, + content: typing.Any, + principals: typing.Iterable[bsfs.schema.Predicate], + ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: + """Return (node, predicate, value) triples.""" + +## EOF ## diff --git a/bsie/base/reader.py b/bsie/base/reader.py new file mode 100644 index 0000000..cbabd36 --- /dev/null +++ b/bsie/base/reader.py @@ -0,0 +1,47 @@ +"""The Reader classes return high-level content structures from files. + +The Reader fulfills two purposes: + First, it brokers between multiple libraries and file formats. + Second, it separates multiple aspects of a file into distinct content types. + +Part of the bsie module. +A copy of the license is provided with the project. +Author: Matthias Baumgartner, 2022 +""" +# imports +import abc +import typing + +# bsie imports +from bsie.utils import bsfs + +# exports +__all__: typing.Sequence[str] = ( + 'Reader', + ) + + +## code ## + +class Reader(abc.ABC): + """Read and return some content from a file.""" + + def __str__(self) -> str: + return bsfs.typename(self) + + def __repr__(self) -> str: + return f'{bsfs.typename(self)}()' + + def __eq__(self, other: typing.Any) -> bool: + return isinstance(other, type(self)) + + def __hash__(self) -> int: + return hash(type(self)) + + @abc.abstractmethod + def __call__(self, path: bsfs.URI) -> typing.Any: + """Return some content of the file at *path*. + Raises a `ReaderError` if the reader cannot make sense of the file format. + """ + +## EOF ## |