aboutsummaryrefslogtreecommitdiffstats
path: root/bsie/base/extractor.py
blob: 2fc4f1891246a21e121b67981d13c2a77c934b3a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""The Extractor classes transform content into triples.

Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
# imports
import abc
import typing

# bsie imports
from bsie.utils import node
from bsie.utils.bsfs import schema as _schema, typename

# exports
# Public names of this module. Only the Extractor base class is listed;
# SCHEMA_PREAMBLE is not part of __all__ and has to be imported by name.
__all__: typing.Sequence[str] = (
    'Extractor',
    )

# constants

# Essential definitions typically used in extractor schemas. The text declares
# common external prefixes (owl, rdf, rdfs, xsd, schema), the bsfs prefixes
# (bsfs, bse), bsfs:Entity as a subclass of bsfs:Node, and xsd:string and
# xsd:integer as subclasses of bsfs:Literal.
# NOTE: The definition here is only for convenience; each Extractor must
# implement its use, if so desired. Nothing in this module applies the
# preamble automatically.
SCHEMA_PREAMBLE = '''
    # common external prefixes
    prefix owl: <http://www.w3.org/2002/07/owl#>
    prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    prefix xsd: <http://www.w3.org/2001/XMLSchema#>
    prefix schema: <http://schema.org/>

    # common bsfs prefixes
    prefix bsfs: <http://bsfs.ai/schema/>
    prefix bse: <http://bsfs.ai/schema/Entity#>

    # essential nodes
    bsfs:Entity rdfs:subClassOf bsfs:Node .

    # common definitions
    xsd:string rdfs:subClassOf bsfs:Literal .
    xsd:integer rdfs:subClassOf bsfs:Literal .

    '''


## code ##

class Extractor(abc.ABC):
    """Abstract base for classes that turn content into (node, predicate, value)-triples.

    Subclasses have to implement `extract`. Equality and hashing are based
    on the extractor's type, its CONTENT_READER, and its schema.
    """

    # kind of content the extractor expects (i.e. reader subclass).
    CONTENT_READER: typing.Optional[str] = None

    # schema of the extractor.
    schema: _schema.Schema

    def __init__(self, schema: _schema.Schema):
        """Store *schema* as the extractor's schema."""
        self.schema = schema

    def __str__(self) -> str:
        """Return the extractor's type name as reported by `typename`."""
        return typename(self)

    def __repr__(self) -> str:
        """Return the extractor's type name followed by an empty pair of parentheses."""
        name = typename(self)
        return f'{name}()'

    def __eq__(self, other: typing.Any) -> bool:
        """Return whether *other* is an equivalent extractor.

        Equality requires that *other* is an instance of this extractor's
        type and that CONTENT_READER and schema match.
        """
        # anything that is not an instance of our own type is never equal.
        if not isinstance(other, type(self)):
            return False
        # the expected content type has to match ...
        if self.CONTENT_READER != other.CONTENT_READER:
            return False
        # ... and so does the schema.
        return self.schema == other.schema

    def __hash__(self) -> int:
        """Return a hash over the extractor's type, CONTENT_READER, and schema."""
        identity = (type(self), self.CONTENT_READER, self.schema)
        return hash(identity)

    def predicates(self) -> typing.Iterator[_schema.Predicate]:
        """Return the predicates that may be part of extracted triples."""
        # NOTE: The schema may hold predicates that never show up in actual
        # triples because they only exist due to the predicate class hierarchy
        # (e.g., bsfs:Predicate is part of every schema but should not be used
        # in triples). Reporting every predicate of the schema is not
        # necessarily the most efficient choice, but it is the safest one.
        # Concrete extractors that emit additional predicates (e.g. auxiliary
        # nodes with their own predicates) should override this method so that
        # only the principal predicates are announced.
        schema_predicates = self.schema.predicates()
        return schema_predicates

    @abc.abstractmethod
    def extract(
            self,
            subject: node.Node,
            content: typing.Any,
            predicates: typing.Iterable[_schema.Predicate],
            ) -> typing.Iterator[typing.Tuple[node.Node, _schema.Predicate, typing.Any]]:
        """Return (node, predicate, value) triples.

        Abstract method; concrete extractors provide the implementation.
        It receives the *subject* node, the *content*, and an iterable of
        *predicates*.
        """

## EOF ##