diff options
Diffstat (limited to 'bsie/base/extractor.py')
-rw-r--r-- | bsie/base/extractor.py | 103 |
1 files changed, 0 insertions, 103 deletions
diff --git a/bsie/base/extractor.py b/bsie/base/extractor.py deleted file mode 100644 index c44021b..0000000 --- a/bsie/base/extractor.py +++ /dev/null @@ -1,103 +0,0 @@ -"""The Extractor classes transform content into triples. - -Part of the bsie module. -A copy of the license is provided with the project. -Author: Matthias Baumgartner, 2022 -""" -# imports -import abc -import typing - -# bsie imports -from bsie.utils import bsfs, node, ns - -# exports -__all__: typing.Sequence[str] = ( - 'Extractor', - ) - -# constants - -# essential definitions typically used in extractor schemas. -# NOTE: This preamble is only for convenience; Each Extractor must implement its use, if so desired. -SCHEMA_PREAMBLE = ''' - # common external prefixes - prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> - prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> - prefix xsd: <http://www.w3.org/2001/XMLSchema#> - prefix schema: <http://schema.org/> - - # common bsfs prefixes - prefix bsfs: <http://bsfs.ai/schema/> - prefix bse: <http://bsfs.ai/schema/Entity#> - - # essential nodes - bsfs:Entity rdfs:subClassOf bsfs:Node . - bsfs:File rdfs:subClassOf bsfs:Entity . - - # common definitions - xsd:string rdfs:subClassOf bsfs:Literal . - xsd:integer rdfs:subClassOf bsfs:Literal . - - ''' - - -## code ## - -class Extractor(abc.ABC): - """Produce (subject, predicate, value)-triples from some content. - The Extractor produces princpal predicates that provide information - about the content itself (i.e., triples that include the subject), - and may also generate triples with auxiliary predicates if the - extracted value is a node itself. - """ - - # what type of content is expected (i.e. reader subclass). - CONTENT_READER: typing.Optional[str] = None - - # extractor schema. - _schema: bsfs.schema.Schema - - def __init__(self, schema: bsfs.schema.Schema): - self._schema = schema - - def __str__(self) -> str: - return bsfs.typename(self) - - def __repr__(self) -> str: - return f'{bsfs.typename(self)}()' - - def __eq__(self, other: typing.Any) -> bool: - return isinstance(other, type(self)) \ - and self.CONTENT_READER == other.CONTENT_READER \ - and self.schema == other.schema - - def __hash__(self) -> int: - return hash((type(self), self.CONTENT_READER, self.schema)) - - @property - def schema(self) -> bsfs.schema.Schema: - """Return the extractor's schema.""" - return self._schema - - @property - def principals(self) -> typing.Iterator[bsfs.schema.Predicate]: - """Return the principal predicates, i.e., relations from/to the extraction subject.""" - ent = self.schema.node(ns.bsfs.Entity) - return ( - pred - for pred - in self.schema.predicates() - if pred.domain <= ent or (pred.range is not None and pred.range <= ent) - ) - - @abc.abstractmethod - def extract( - self, - subject: node.Node, - content: typing.Any, - principals: typing.Iterable[bsfs.schema.Predicate], - ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: - """Return (node, predicate, value) triples.""" - -## EOF ## |