"""The Extractor classes transform content into triples. Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ # standard imports import abc import typing # bsie imports from bsie.utils import bsfs, node, ns # exports __all__: typing.Sequence[str] = ( 'Extractor', ) # constants # essential definitions typically used in extractor schemas. # NOTE: This preamble is only for convenience; Each Extractor must implement its use, if so desired. SCHEMA_PREAMBLE = ''' # common external prefixes prefix rdf: prefix rdfs: prefix xsd: prefix schema: # common bsfs prefixes prefix bsfs: prefix bse: # essential nodes bsfs:Entity rdfs:subClassOf bsfs:Node . bsfs:File rdfs:subClassOf bsfs:Entity . # common definitions xsd:string rdfs:subClassOf bsfs:Literal . xsd:integer rdfs:subClassOf bsfs:Literal . ''' ## code ## class Extractor(abc.ABC): """Produce (subject, predicate, value)-triples from some content. The Extractor produces princpal predicates that provide information about the content itself (i.e., triples that include the subject), and may also generate triples with auxiliary predicates if the extracted value is a node itself. """ # what type of content is expected (i.e. reader subclass). CONTENT_READER: typing.Optional[str] = None # extractor schema. _schema: bsfs.schema.Schema def __init__(self, schema: bsfs.schema.Schema): self._schema = schema def __str__(self) -> str: return bsfs.typename(self) def __repr__(self) -> str: return f'{bsfs.typename(self)}()' def __eq__(self, other: typing.Any) -> bool: return isinstance(other, type(self)) \ and self.CONTENT_READER == other.CONTENT_READER \ and self.schema == other.schema def __hash__(self) -> int: return hash((type(self), self.CONTENT_READER, self.schema)) @property def schema(self) -> bsfs.schema.Schema: """Return the extractor's schema.""" return self._schema @property def principals(self) -> typing.Iterator[bsfs.schema.Predicate]: """Return the principal predicates, i.e., relations from/to the extraction subject.""" ent = self.schema.node(ns.bsfs.Entity) return ( pred for pred in self.schema.predicates() if pred.domain <= ent or (pred.range is not None and pred.range <= ent) ) @abc.abstractmethod def extract( self, subject: node.Node, content: typing.Any, principals: typing.Iterable[bsfs.schema.Predicate], ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]: """Return (node, predicate, value) triples.""" ## EOF ##