"""The Extractor classes transform content into triples. Part of the bsie module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ # imports import abc import typing # bsie imports from bsie.utils import node from bsie.utils.bsfs import schema as _schema, typename # exports __all__: typing.Sequence[str] = ( 'Extractor', ) # constants # essential definitions typically used in extractor schemas. # NOTE: The definition here is only for convenience; Each Extractor must implement its use, if so desired. SCHEMA_PREAMBLE = ''' # common external prefixes prefix owl: prefix rdf: prefix rdfs: prefix xsd: prefix schema: # common bsfs prefixes prefix bsfs: prefix bse: # essential nodes bsfs:Entity rdfs:subClassOf bsfs:Node . # common definitions xsd:string rdfs:subClassOf bsfs:Literal . xsd:integer rdfs:subClassOf bsfs:Literal . ''' ## code ## class Extractor(abc.ABC): """Produce (node, predicate, value)-triples from some content.""" # what type of content is expected (i.e. reader subclass). CONTENT_READER: typing.Optional[str] = None # extractor schema. schema: _schema.Schema def __init__(self, schema: _schema.Schema): self.schema = schema def __str__(self) -> str: return typename(self) def __repr__(self) -> str: return f'{typename(self)}()' def __eq__(self, other: typing.Any) -> bool: return isinstance(other, type(self)) \ and self.CONTENT_READER == other.CONTENT_READER \ and self.schema == other.schema def __hash__(self) -> int: return hash((type(self), self.CONTENT_READER, self.schema)) def predicates(self) -> typing.Iterator[_schema.Predicate]: """Return the predicates that may be part of extracted triples.""" # NOTE: Some predicates in the schema might not occur in actual triples, # but are defined due to predicate class hierarchy. E.g., bsfs:Predicate # is part of every schema but should not be used in triples. # Announcing all predicates might not be the most efficient way, however, # it is the most safe one. Concrete extractors that produce additional # predicates (e.g. auxiliary nodes with their own predicates) should # overwrite this method to only include the principal predicates. return self.schema.predicates() @abc.abstractmethod def extract( self, subject: node.Node, content: typing.Any, predicates: typing.Iterable[_schema.Predicate], ) -> typing.Iterator[typing.Tuple[node.Node, _schema.Predicate, typing.Any]]: """Return (node, predicate, value) triples.""" ## EOF ##